Source code for mzutils.nlp_tasks.data_preprocessing

import codecs
import json
import os

import mzutils.json_funcs
import mzutils.os_funcs


# ---------------------------------SQuAD 1.1 Functions---------------------------------

# The structure looks like this:
# SQuAD:https://rajpurkar.github.io/SQuAD-explorer/
#
# file.json
# ├── "data"
# │   └── [i]
# │       ├── "paragraphs"
# │       │   └── [j]
# │       │       ├── "context": "paragraph text"
# │       │       └── "qas"
# │       │           └── [k]
# │       │               ├── "answers"
# │       │               │   └── [l]
# │       │               │       ├── "answer_start": N
# │       │               │       └── "text": "answer"
# │       │               ├── "id": "<uuid>"
# │       │               └── "question": "paragraph question?"
# │       └── "title": "document id"
# └── "version": 1.1


def generate_multi_test_cases(list_of_paragraphs, list_of_questions, json_store_path):
    """
    Write paragraph/question pairs to a JSON file laid out like SQuAD 1.1 data.

    Every pair becomes its own entry of "data": an empty title, one paragraph
    and one question. The answer is a placeholder ("answer_start" -1, empty
    "text") and the question "id" is the integer position of the pair.

    :param list_of_paragraphs: paragraph texts, stored as each entry's "context".
    :param list_of_questions: questions; list_of_questions[j] is paired with
        list_of_paragraphs[j].
    :param json_store_path: path of the JSON file to write (opened for writing,
        so an existing file is replaced).
    :return: None
    :raises AssertionError: if the two lists differ in length. Nothing is
        written in that case.
    """
    if len(list_of_paragraphs) != len(list_of_questions):
        # Raised explicitly instead of using an `assert` statement so the check
        # still runs under `python -O`; the exception type is kept for callers.
        raise AssertionError(
            "list_of_paragraphs and list_of_questions must have the same length"
        )

    data = []
    for index, (paragraph, question) in enumerate(zip(list_of_paragraphs, list_of_questions)):
        placeholder_qa = {
            "answers": [{"answer_start": -1, "text": ""}],
            "question": question,
            "id": index,
        }
        # "paragraphs" could hold several paragraphs; one per entry is emitted here.
        data.append({
            "title": "",
            "paragraphs": [{"context": paragraph, "qas": [placeholder_qa]}],
        })

    with codecs.open(json_store_path, 'w', encoding='utf-8') as fp:
        json.dump({"data": data, "version": "1.1"}, fp)
def simple_squad_segmentor(squad_file_path, store_location, num_of_paragraphs=500):
    """
    Split the top-level "data" list of a SQuAD-style JSON file into smaller files.

    Chunk i holds entries [i * num_of_paragraphs, (i + 1) * num_of_paragraphs)
    of "data" and is written as {"data": [...], "version": "1.1"} to
    os.path.join(store_location, <name>) + str(i) + ".json", where <name> is the
    first element returned by mzutils.os_funcs.basename_and_extension
    (presumably the file name without its extension -- confirm against that helper).

    Only non-empty chunks are written: an input whose length is an exact
    multiple of num_of_paragraphs no longer produces a trailing file with an
    empty "data" list, and an empty input produces no files.

    :param squad_file_path: path of the SQuAD-style JSON file to split.
    :param store_location: directory for the chunk files; created through
        mzutils.os_funcs.mkdir_p before the input is loaded.
    :param num_of_paragraphs: maximum number of "data" entries per chunk file.
        Note that this counts entries of "data", not the paragraphs inside them.
    :return: None
    """
    mzutils.os_funcs.mkdir_p(store_location)
    squad_file_name = mzutils.os_funcs.basename_and_extension(squad_file_path)[0]
    squad_file_data = mzutils.json_funcs.load_config(squad_file_path)["data"]

    # Ceiling division: the previous `len // n + 1` wrote one extra, empty
    # chunk whenever len was an exact multiple of n.
    chunk_count = -(-len(squad_file_data) // num_of_paragraphs)

    for i in range(chunk_count):
        start = i * num_of_paragraphs
        store_dict = {"data": squad_file_data[start:start + num_of_paragraphs], "version": "1.1"}
        chunk_path = os.path.join(store_location, squad_file_name) + str(i) + ".json"
        with codecs.open(chunk_path, 'w+', encoding='utf-8') as fp:
            json.dump(store_dict, fp)
# ---------------------------------TriviaQA Functions---------------------------------

# The structure looks like this:
#
# file.json
# ├── [{}] "Data"
# │   ├── {} "Answer"
# │   │   ├── [] "Aliases"
# │   │   ├── [] "NormalizedAliases"
# │   │   └── "NormalizedValue"
# │   ├── "Question"
# │   └── "QuestionId"
# Other rows that are not used here are omitted.
def retrieve_questions_from_triviaQA(file_path, destination_path=None):
    """
    Collect every question of a TriviaQA file together with its accepted answers.

    For each entry of the top-level "Data" list the accepted answers are the
    concatenation of Answer.Aliases, Answer.NormalizedAliases and, as a final
    element, Answer.NormalizedValue.

    :param file_path: path of the TriviaQA JSON file, loaded through
        mzutils.json_funcs.load_config.
    :param destination_path: when truthy, the result is written there as
        {"data": [...]} through mzutils.json_funcs.dump_config instead of
        being returned.
    :return: [{"question": ..., "questionid": ..., "acceptableanswers": [...]}]
        when destination_path is falsy, otherwise None.
    """
    questions = []
    for entry in mzutils.json_funcs.load_config(file_path)["Data"]:
        answer = entry["Answer"]
        accepted = answer["Aliases"] + answer["NormalizedAliases"] + [answer["NormalizedValue"]]
        questions.append({
            "question": entry["Question"],
            "questionid": entry["QuestionId"],
            "acceptableanswers": accepted,
        })

    if destination_path:
        mzutils.json_funcs.dump_config(destination_path, {"data": questions})
        return None
    return questions
def generate_multi_test_cases_triviaQA(retrieved_json_path, json_store_path, documents_path, missing_file_path=None):
    """
    Pair each retrieved question with each of its documents in a SQuAD 1.1 style file.

    The "data" list of retrieved_json_path is read; every item must provide
    "question", "questionid", "acceptableanswers" and "documents". For every
    document name that exists under documents_path, one entry is emitted whose
    single paragraph uses the document text as "context" and carries the
    question with a placeholder answer ("answer_start" -1, empty "text").

    The question "id" is the string form of
    [question_order, pair_order, questionid, acceptableanswers], for example
    "[0, 0, 'tc_1250', ['Martina Hingis', 'martina hingis']]".
    question_order counts questions from 0 and pair_order counts emitted
    question/document pairs from 0 across the whole file. Presumably a
    predictions file produced downstream is keyed by this id -- confirm with
    the evaluation code.

    A progress line is printed every 500 questions.

    :param retrieved_json_path: JSON file with a top-level "data" list, loaded
        through mzutils.json_funcs.load_config.
    :param json_store_path: path of the SQuAD-style JSON file to write.
    :param documents_path: directory that the document names are joined onto.
    :param missing_file_path: when truthy, the names of documents that were not
        found are written there as a JSON list.
    :return: None
    """
    entries = []
    absent_documents = []
    pair_order = 0  # question/document pairs emitted so far

    questions = mzutils.json_funcs.load_config(retrieved_json_path)['data']
    for question_order, item in enumerate(questions):
        if question_order % 500 == 0:
            print(str(question_order) + " questions formatted ... ")

        question = item["question"]
        questionid = item["questionid"]
        acceptableanswers = item["acceptableanswers"]

        for document_name in item["documents"]:
            doc_path = os.path.join(documents_path, document_name)
            if not os.path.exists(doc_path):
                absent_documents.append(document_name)
                continue

            with codecs.open(doc_path, 'r', encoding='utf8') as fp:
                context = fp.read()

            placeholder_qa = {
                "answers": [{"answer_start": -1, "text": ""}],
                "question": question,
                "id": str([question_order, pair_order, questionid, acceptableanswers]),
            }
            # "paragraphs" could hold several paragraphs; one per entry is emitted here.
            entries.append({"title": "", "paragraphs": [{"context": context, "qas": [placeholder_qa]}]})
            pair_order += 1

    with codecs.open(json_store_path, 'w+', encoding='utf-8') as fp:
        json.dump({"data": entries, "version": "1.1"}, fp)

    if missing_file_path:
        with codecs.open(missing_file_path, 'w+', encoding='utf-8') as fp:
            json.dump(absent_documents, fp)
# ---------------------------------TriviaQA Evaluation Functions---------------------------------
def concatenate_predictions_dicts(squadjsons_files_dir, output_file=None):
    """
    Merge the predictions.json of consecutive squadjsons<N> directories into one file.

    Directories squadjsons0, squadjsons1, ... are visited in order until the
    first index whose directory does not exist. Each directory's
    predictions.json is loaded and merged with dict.update, so on duplicate
    keys the value from the later directory wins.

    :param squadjsons_files_dir: should be one of "wikipedia-train"
        "wikipedia-dev" "web-train" "web-dev" "verified-web-dev"
        "verified-wikipedia-dev"; it is actually a directory with format:

        squadjsons_files_dir
        ├── squadjsons%d
        │   └── predictions.json

    :param output_file: path of the merged predictions file; when falsy it
        defaults to predictions.json inside squadjsons_files_dir.
    :return: None
    """
    output_file = output_file or os.path.join(squadjsons_files_dir, "predictions.json")

    merged = {}
    index = 0
    chunk_dir = os.path.join(squadjsons_files_dir, "squadjsons" + str(index))
    while os.path.isdir(chunk_dir):
        chunk_predictions = mzutils.json_funcs.load_config(os.path.join(chunk_dir, "predictions.json"))
        merged.update(chunk_predictions)
        index += 1
        chunk_dir = os.path.join(squadjsons_files_dir, "squadjsons" + str(index))

    mzutils.json_funcs.dump_config(output_file, merged)