import codecs
import json
import os
import mzutils.json_funcs
import mzutils.os_funcs
# ---------------------------------SQuAD 1.1 Functions---------------------------------
# The structure looks like this:
# SQuAD:https://rajpurkar.github.io/SQuAD-explorer/
#
# file.json
# ├── "data"
# │   └── [i]
# │       ├── "paragraphs"
# │       │   └── [j]
# │       │       ├── "context": "paragraph text"
# │       │       └── "qas"
# │       │           └── [k]
# │       │               ├── "answers"
# │       │               │   └── [l]
# │       │               │       ├── "answer_start": N
# │       │               │       └── "text": "answer"
# │       │               ├── "id": "<uuid>"
# │       │               └── "question": "paragraph question?"
# │       └── "title": "document id"
# └── "version": "1.1"
def generate_multi_test_cases(list_of_paragraphs, list_of_questions, json_store_path):
    """
    Write paragraph/question pairs to a json file laid out like a SQuAD 1.1 data file.

    The j-th paragraph is paired with the j-th question. Every pair becomes its own entry
    of "data" with an empty "title", exactly one paragraph and exactly one question whose
    "id" is the integer position j. No answer is known at this point, so each question
    carries a single placeholder answer: {"answer_start": -1, "text": ""}.

    :param list_of_paragraphs: sequence of paragraph texts, stored under "context".
    :param list_of_questions: sequence of questions, same length as list_of_paragraphs.
    :param json_store_path: path of the json file to write; an existing file is overwritten.
    :return: None
    :raises AssertionError: if the two sequences differ in length.
    """
    if len(list_of_paragraphs) != len(list_of_questions):
        # Raised explicitly instead of via an ``assert`` statement so the check is not
        # stripped under ``python -O``; the exception type is kept for existing callers.
        raise AssertionError(
            "list_of_paragraphs and list_of_questions must have the same length "
            "(got {} and {})".format(len(list_of_paragraphs), len(list_of_questions))
        )

    data = [
        {
            "title": "",
            # a SQuAD entry may hold several paragraphs; one per entry is emitted here
            "paragraphs": [
                {
                    "context": paragraph,
                    "qas": [
                        {
                            "answers": [{"answer_start": -1, "text": ""}],
                            "question": question,
                            "id": index,
                        }
                    ],
                }
            ],
        }
        for index, (paragraph, question) in enumerate(zip(list_of_paragraphs, list_of_questions))
    ]
    jsondict = {"data": data, "version": "1.1"}

    # Plain write mode is enough: the file is never read back through this handle.
    with codecs.open(json_store_path, 'w', encoding='utf-8') as fp:
        json.dump(jsondict, fp)
def simple_squad_segmentor(squad_file_path, store_location, num_of_paragraphs=500):
    """
    Split a SQuAD 1.1 style json file into several smaller SQuAD 1.1 style json files.

    The top-level "data" list of the input is cut into consecutive chunks of at most
    num_of_paragraphs entries. Chunk number i (counting from 0) is written to
    ``os.path.join(store_location, <name>) + str(i) + ".json"`` as
    {"data": <chunk>, "version": "1.1"}, where <name> is the first element returned by
    mzutils.os_funcs.basename_and_extension(squad_file_path).

    Only non-empty chunks are written: an input whose length is an exact multiple of
    num_of_paragraphs no longer gets a trailing file with an empty "data" list, and an
    input with an empty "data" list produces no files.

    :param squad_file_path: path of the SQuAD style json file to split.
    :param store_location: directory that receives the chunk files; passed to
        mzutils.os_funcs.mkdir_p first (presumably creating it when missing -- confirm).
    :param num_of_paragraphs: maximum number of top-level "data" entries per chunk file.
        NOTE(review): despite the name, the unit is entries of "data", not the
        "paragraphs" nested inside them.
    :return: None
    """
    mzutils.os_funcs.mkdir_p(store_location)
    squad_file_name = mzutils.os_funcs.basename_and_extension(squad_file_path)[0]
    squad_file_data = mzutils.json_funcs.load_config(squad_file_path)["data"]

    # Ceiling division. The previous ``len // n + 1`` over-counted by one whenever the
    # length was an exact multiple of n, emitting an extra file with no data in it.
    num_chunks = (len(squad_file_data) + num_of_paragraphs - 1) // num_of_paragraphs
    output_prefix = os.path.join(store_location, squad_file_name)

    for i in range(num_chunks):
        chunk = squad_file_data[(i * num_of_paragraphs):((i + 1) * num_of_paragraphs)]
        store_dict = {"data": chunk, "version": "1.1"}
        with codecs.open(output_prefix + str(i) + ".json", 'w', encoding='utf-8') as fp:
            json.dump(store_dict, fp)
# ---------------------------------TriviaQA Functions---------------------------------
# file.json
# ├── [{}] "Data"
# │   ├── {} "Answer"
# │   │   ├── [] "Aliases"
# │   │   ├── [] "NormalizedAliases"
# │   │   └── "NormalizedValue"
# │   ├── "Question"
# │   └── "QuestionId"
# Other fields are not used here and are omitted.
def retrieve_questions_from_triviaQA(file_path, destination_path=None):
    """
    Collect the question, its id and every accepted answer from a TriviaQA json file.

    For each element of the top-level "Data" list one record is built:
    {"question": <"Question">, "questionid": <"QuestionId">, "acceptableanswers": [...]}
    where "acceptableanswers" is "Aliases" followed by "NormalizedAliases" followed by
    "NormalizedValue" of that element's "Answer" (duplicates are not removed).

    :param file_path: path of the TriviaQA json file, read via mzutils.json_funcs.load_config.
    :param destination_path: optional output path. When truthy, {"data": <records>} is
        handed to mzutils.json_funcs.dump_config(destination_path, ...) instead of
        being returned.
    :return: the list of records when destination_path is falsy, otherwise None.
    """

    def to_record(entry):
        # Flatten one "Data" element into the lowercase-keyed record described above.
        answer = entry["Answer"]
        accepted = answer["Aliases"] + answer["NormalizedAliases"] + [answer["NormalizedValue"]]
        return {
            "question": entry["Question"],
            "questionid": entry["QuestionId"],
            "acceptableanswers": accepted,
        }

    entries = mzutils.json_funcs.load_config(file_path)["Data"]
    records = [to_record(entry) for entry in entries]

    if destination_path:
        mzutils.json_funcs.dump_config(destination_path, {"data": records})
        return None
    return records
def generate_multi_test_cases_triviaQA(retrieved_json_path, json_store_path, documents_path, missing_file_path=None):
    """
    Build a SQuAD 1.1 style json file from retrieved TriviaQA questions and their documents.

    The input json must hold a "data" list whose records provide "question",
    "questionid", "acceptableanswers" and "documents" (a list of file names relative to
    documents_path). Every (question, existing document) pair becomes one entry of the
    output "data" list: the document text is the paragraph "context" and the question is
    its only "qas" item, with a placeholder answer {"answer_start": -1, "text": ""}.

    The "id" of each question is the ``str()`` of the Python list
    ``[question_order, pair_order, questionid, acceptableanswers]``, for example
    "[0, 0, 'tc_1250', ['Martina Hingis', 'martina hingis']]". question_order counts
    input records from 0; pair_order counts written (question, document) pairs from 0
    across the whole file.

    Documents that do not exist on disk are skipped and their names collected. A progress
    line is printed for every 500th input record.

    :param retrieved_json_path: path of the input json, read via mzutils.json_funcs.load_config.
    :param json_store_path: path of the SQuAD style json file to write.
    :param documents_path: directory that the document names are joined to.
    :param missing_file_path: optional path; when truthy, the list of skipped document
        names is written there as json.
    :return: None
    """
    records = mzutils.json_funcs.load_config(retrieved_json_path)['data']
    absent_documents = []
    squad_entries = []
    pair_order = -1  # bumped before use, so the first written pair gets 0

    for question_order, record in enumerate(records):
        if question_order % 500 == 0:
            print(str(question_order) + " questions formatted ... ")

        question_text = record["question"]
        question_id = record["questionid"]
        accepted_answers = record["acceptableanswers"]
        document_names = record["documents"]

        for document_name in document_names:
            document_file = os.path.join(documents_path, document_name)
            if not os.path.exists(document_file):
                absent_documents.append(document_name)
                continue

            pair_order += 1
            with codecs.open(document_file, 'r', encoding='utf8') as handle:
                context_text = handle.read()

            qa_item = {
                "answers": [{"answer_start": -1, "text": ""}],
                "question": question_text,
                "id": str([question_order, pair_order, question_id, accepted_answers]),
            }
            # "paragraphs" could hold several paragraphs; one per entry is emitted here
            squad_entries.append({"title": "", "paragraphs": [{"context": context_text, "qas": [qa_item]}]})

    with codecs.open(json_store_path, 'w+', encoding='utf-8') as fp:
        json.dump({"data": squad_entries, "version": "1.1"}, fp)

    if missing_file_path:
        with codecs.open(missing_file_path, 'w+', encoding='utf-8') as fp:
            json.dump(absent_documents, fp)
# ---------------------------------TriviaQA Evaluation Functions---------------------------------
def concatenate_predictions_dicts(squadjsons_files_dir, output_file=None):
    """
    Merge the predictions.json files of numbered sub-directories into one json file.

    Expected layout:
        squadjsons_files_dir
        ├── squadjsons0
        │   └── predictions.json
        ├── squadjsons1
        │   └── predictions.json
        └── ...

    Sub-directories are visited as squadjsons0, squadjsons1, ... and the walk stops at the
    first number whose directory does not exist. Each predictions.json is loaded via
    mzutils.json_funcs.load_config and merged with ``dict.update``, so a key that occurs
    in several files keeps the value from the highest-numbered one.

    :param squadjsons_files_dir: directory holding the squadjsons%d sub-directories
        (e.g. "wikipedia-train", "wikipedia-dev", "web-train", "web-dev",
        "verified-web-dev" or "verified-wikipedia-dev").
    :param output_file: path of the merged json file; when falsy it defaults to
        "predictions.json" directly inside squadjsons_files_dir.
    :return: None
    """
    output_file = output_file or os.path.join(squadjsons_files_dir, "predictions.json")

    merged = {}
    part_number = 0
    part_dir = os.path.join(squadjsons_files_dir, "squadjsons" + str(part_number))
    while os.path.isdir(part_dir):
        part_predictions = mzutils.json_funcs.load_config(os.path.join(part_dir, "predictions.json"))
        merged.update(part_predictions)
        part_number += 1
        part_dir = os.path.join(squadjsons_files_dir, "squadjsons" + str(part_number))

    mzutils.json_funcs.dump_config(output_file, merged)