Source code for mzutils.nlp_tasks.ner_funcs

import numpy as np


def helper_flatten(list_of_lists):
    """
    Lazily flatten exactly one level of nesting.

    :param list_of_lists: an iterable whose items are themselves iterable.
    :return: a generator yielding every item of every inner iterable, in order.
    """
    # The loop variable is deliberately not called `list`, so the builtin is not shadowed.
    for inner in list_of_lists:
        yield from inner
def subword_tokenize_labels(tokens, labels, tokenizer, bert_special_tokens=True):
    """
    Split word-level tokens into sub-words and expand the NER labels to match.

    :param tokens: something like ['John', 'Johanson', 'lives', 'in', 'Ramat', 'Gan', 'Gang', '.'].
        can get from
        tokens = tokenizer.basic_tokenizer.tokenize("John Johanson lives in Ramat Gan Gang.")
        tokens = nltk.word_tokenize("John Johanson lives in Ramat Gan Gang.")
    :param labels: [1, 2, 0, 0, 1, 2, 2, 0]. NER tags, one per token.
        labels:
        0 -> O out
        1 -> B beginning
        2 -> I continued
        3 -> X sub-words that are not tagged.
    :param tokenizer: object providing ``tokenize(word)`` and
        ``encode(subwords, add_special_tokens=False)``,
        e.g. transformers.BertTokenizer.from_pretrained('bert-base-cased')
    :param bert_special_tokens: add '[CLS]' and '[SEP]' or not. When False, no
        padding labels are added and the start indices are zero-based.
    :return: a 4-tuple (subwords, encoded_subwords, token_start_idxs, bert_labels), e.g.
        (['[CLS]', 'john', 'johan', '##son', 'lives', 'in', 'rama', '##t', 'gan', 'gang', '.', '[SEP]'],
        [101, 2198, 13093, 3385, 3268, 1999, 14115, 2102, 25957, 6080, 1012, 102],
        array([ 1, 2, 4, 5, 6, 8, 9, 10]),
        [0, 1, 2, 3, 0, 0, 1, 3, 2, 2, 0, 0])
        token_start_idxs holds, for every input token, the position of its first
        sub-word inside the returned ``subwords`` list.
    :raises AssertionError: if ``tokens`` and ``labels`` differ in length, or if the
        expanded labels do not line up with the sub-words (this happens when a
        token produces no sub-words at all).
    """
    assert len(tokens) == len(labels)

    pieces_per_token = [tokenizer.tokenize(token) for token in tokens]
    subword_lengths = [len(pieces) for pieces in pieces_per_token]
    subwords = [piece for pieces in pieces_per_token for piece in pieces]

    # The first sub-word of a token keeps the token's label; the rest are tagged 3 (X).
    bert_labels = [
        tag
        for sublen, label in zip(subword_lengths, labels)
        for tag in [label] + (sublen - 1) * [3]
    ]

    # Running total of sub-word counts; dropping the last entry leaves one start
    # index per token (and an empty array for empty input).
    token_start_idxs = np.cumsum([0] + subword_lengths)[:-1]

    if bert_special_tokens:
        # '[CLS]' shifts every start index by one; both special tokens are labelled 0 (O).
        subwords = ['[CLS]'] + subwords + ['[SEP]']
        bert_labels = [0] + bert_labels + [0]
        token_start_idxs = token_start_idxs + 1

    encoded_subwords = tokenizer.encode(subwords, add_special_tokens=False)
    assert len(subwords) == len(bert_labels)
    return subwords, encoded_subwords, token_start_idxs, bert_labels
def labels_from_subword_labels(tokens, bert_labels, tokenizer, bert_special_tokens=True):
    """
    Collapse sub-word level labels back to one label per original token.

    :param tokens: something like ['John', 'Johanson', 'lives', 'in', 'Ramat', 'Gan', 'Gang', '.'].
        can get from
        tokens = tokenizer.basic_tokenizer.tokenize("John Johanson lives in Ramat Gan Gang.")
        tokens = nltk.word_tokenize("John Johanson lives in Ramat Gan Gang.")
    :param bert_labels: [0, 1, 2, 3, 0, 0, 1, 3, 2, 2, 0, 0]. NER tags from subword_tokenize_labels.
        labels:
        0 -> O out
        1 -> B beginning
        2 -> I continued
        3 -> X sub-words that are not tagged.
    :param tokenizer: object providing ``tokenize(word)``,
        e.g. transformers.BertTokenizer.from_pretrained('bert-base-cased')
    :param bert_special_tokens: whether ``bert_labels`` carries a leading and a
        trailing entry for '[CLS]' and '[SEP]' that must be dropped first.
    :return: (['John', 'Johanson', 'lives', 'in', 'Ramat', 'Gan', 'Gang', '.'], [1, 2, 0, 0, 1, 2, 2, 0])
        i.e. the unchanged ``tokens`` plus, for each token, the label found on
        its first sub-word.
    """
    # Discard the '[CLS]' / '[SEP]' positions when they are present.
    inner_labels = bert_labels[1:-1] if bert_special_tokens else bert_labels

    pieces_per_token = [tokenizer.tokenize(word) for word in tokens]

    # Walk through the sub-word labels, jumping one whole token at a time so
    # that only the label of each token's first sub-word is picked up.
    labels = []
    cursor = 0
    for pieces in pieces_per_token:
        labels.append(inner_labels[cursor])
        cursor += len(pieces)

    assert len(tokens) == len(labels)
    return tokens, labels
def rejoin_bert_tokenized_sentence(sentence):
    """
    Join tokens back into a single string, gluing '##' word pieces onto the preceding token.

    Tokens are joined with single spaces and every occurrence of ' ##' is then
    removed, so ['johan', '##son', 'lives'] becomes "johanson lives". Nothing
    else is re-attached: punctuation stays space separated.

    original sentence is "The Smiths' used their son's car."
    tokenizer.basic_tokenizer.tokenize("The Smiths' used their son's car.") gives
    ['the', 'smiths', "'", 'used', 'their', 'son', "'", 's', 'car', '.']
    this function then returns "the smiths ' used their son ' s car ."
    tokenizer.basic_tokenizer.tokenize on that returned text gives
    ['the', 'smiths', "'", 'used', 'their', 'son', "'", 's', 'car', '.'] again.

    :param sentence: an iterable of token strings.
    :return: the re-joined text.
    """
    # str.join accepts any iterable of strings, so no intermediate list copy is needed.
    return ' '.join(sentence).replace(' ##', '')