Source code for mzutils.nlp_tasks.ner_funcs

import numpy as np


def helper_flatten(list_of_lists):
    """
    Lazily yield every item of every inner iterable, in order.

    :param list_of_lists: an iterable of iterables, e.g. [[1, 2], [3], []].

    :return: a generator over the inner items, e.g. 1, 2, 3 for the example above.
    """
    # `sublist` rather than `list`, so the builtin is not shadowed inside the loop.
    for sublist in list_of_lists:
        yield from sublist


def subword_tokenize_labels(tokens, labels, tokenizer, bert_special_tokens=True):
    """
    Split word-level tokens into sub-words and spread the word-level NER labels over them.

    The first sub-word of every token keeps the token's label; every further sub-word of the
    same token is labelled 3 (X).

    :param tokens: something like ['John', 'Johanson', 'lives', 'in', 'Ramat', 'Gan', 'Gang', '.']. can get from
    tokens = tokenizer.basic_tokenizer.tokenize("John Johanson lives in Ramat Gan Gang.")
    tokens = nltk.word_tokenize("John Johanson lives in Ramat Gan Gang.")

    :param labels: [1, 2, 0, 0, 1, 2, 2, 0]. NER tokens, one per entry of tokens.
    labels:
    0 -> O out
    1 -> B beginning
    2 -> I continued
    3 -> X sub-words that are not tagged.

    :param tokenizer: e.g. transformers.BertTokenizer.from_pretrained('bert-base-cased'). It is called as
    tokenizer.tokenize(token) and tokenizer.encode(subwords, add_special_tokens=False).

    :param bert_special_tokens: add '[CLS]' and '[SEP]' or not. When True, both get label 0 and
    token_start_idxs is shifted by one to skip '[CLS]'. When False, nothing is added and token_start_idxs
    is 0-based.

    :return: (subwords, encoded_subwords, token_start_idxs, bert_labels), e.g.
    (['[CLS]', 'john', 'johan', '##son', 'lives', 'in', 'rama', '##t', 'gan', 'gang', '.', '[SEP]'], [101, 2198, 13093, 3385, 3268, 1999, 14115, 2102, 25957, 6080, 1012, 102], array([ 1,  2,  4,  5,  6,  8,  9, 10]), [0, 1, 2, 3, 0, 0, 1, 3, 2, 2, 0, 0])
    """
    assert len(tokens) == len(labels), "tokens and labels must have the same length"

    subwords = []  # flat list of sub-words over all tokens
    subword_lengths = []  # number of sub-words each token was split into
    bert_labels = []  # one label per entry of `subwords`
    for token, label in zip(tokens, labels):
        pieces = tokenizer.tokenize(token)
        subwords.extend(pieces)
        subword_lengths.append(len(pieces))
        # First sub-word carries the word label, the rest are marked 3 (X).
        bert_labels.append(label)
        bert_labels.extend([3] * (len(pieces) - 1))

    if bert_special_tokens:
        subwords = ['[CLS]'] + subwords + ['[SEP]']
        bert_labels = [0] + bert_labels + [0]
        first_token_idx = 1  # position 0 is taken by '[CLS]'
    else:
        # No special tokens: labels must not be padded and indices start at 0,
        # otherwise labels and sub-words can never line up.
        first_token_idx = 0

    # Exclusive prefix sum of the lengths: index of the first sub-word of each token.
    # Dropping the last cumsum entry (instead of the last length) keeps this empty for empty input.
    token_start_idxs = first_token_idx + np.cumsum([0] + subword_lengths)[:-1]

    # Fails when a token produced no sub-word at all: it still received one label above.
    assert len(subwords) == len(bert_labels), "sub-words and labels are misaligned"
    encoded_subwords = tokenizer.encode(subwords, add_special_tokens=False)

    return subwords, encoded_subwords, token_start_idxs, bert_labels


def labels_from_subword_labels(tokens, bert_labels, tokenizer, bert_special_tokens=True):
    """
    Recover one label per word-level token from sub-word level labels.

    Each token is re-tokenized with tokenizer to learn how many sub-words it spans; the label
    found at the position of its first sub-word becomes the label of the token.

    :param tokens: something like ['John', 'Johanson', 'lives', 'in', 'Ramat', 'Gan', 'Gang', '.']. can get from
    tokens = tokenizer.basic_tokenizer.tokenize("John Johanson lives in Ramat Gan Gang.")
    tokens = nltk.word_tokenize("John Johanson lives in Ramat Gan Gang.")

    :param bert_labels: [0, 1, 2, 3, 0, 0, 1, 3, 2, 2, 0, 0]. NER tokens from subword_tokenize_labels.
    labels:
    0 -> O out
    1 -> B beginning
    2 -> I continued
    3 -> X sub-words that are not tagged.

    :param tokenizer: e.g. transformers.BertTokenizer.from_pretrained('bert-base-cased')

    :param bert_special_tokens: whether bert_labels contains entries for '[CLS]' and '[SEP]'. When True the
    first and the last entry are dropped before the labels are read.

    :return: (tokens, labels), e.g.
    (['John', 'Johanson', 'lives', 'in', 'Ramat', 'Gan', 'Gang', '.'], [1, 2, 0, 0, 1, 2, 2, 0])
    """
    # Strip the positions of '[CLS]' and '[SEP]' so index 0 is the first real sub-word.
    piece_labels = bert_labels[1:-1] if bert_special_tokens else bert_labels
    pieces_per_token = [len(tokenizer.tokenize(token)) for token in tokens]

    labels = []
    offset = 0  # position of the current token's first sub-word inside piece_labels
    for piece_count in pieces_per_token:
        labels.append(piece_labels[offset])
        offset += piece_count
    assert len(tokens) == len(labels)
    return tokens, labels


def rejoin_bert_tokenized_sentence(sentence):
    """
    Join tokens with single spaces and glue '##' continuation pieces back onto the preceding token.

    original sentence is "The Smiths' used their son's car."
    tokenizer.basic_tokenizer.tokenize("The Smiths' used their son's car.") gives ['the', 'smiths', "'", 'used', 'their', 'son', "'", 's', 'car', '.']
    fine_text returns "the smiths ' used their son ' s car ."
    tokenizer.basic_tokenizer.tokenize(fine_text) gives ['the', 'smiths', "'", 'used', 'their', 'son', "'", 's', 'car', '.'] again.

    Sub-word pieces are merged as well: ['john', 'johan', '##son'] gives "john johanson".

    :param sentence: an iterable of token strings.

    :return: the joined text. Only the sequence ' ##' is removed, so a '##' at the very start of the
    text is left untouched.
    """
    # join accepts any iterable of strings, no intermediate list copy is needed.
    text = ' '.join(sentence)
    return text.replace(' ##', '')