import copy
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import corpus_bleu
import torch
import numpy as np
def compute_sentence_pseudo_mlm_perplexity(model, tokenizer, sentence: str, mask_token: str = '[MASK]', max_length: int = 512, empty_cache: bool = False, batch_size=64):
    """Compute the pseudo-perplexity of a sentence under a masked language model.

    Contrary to https://huggingface.co/docs/transformers/perplexity, diagonal
    masking is used: the encoded sentence is repeated once per inner token and,
    in copy ``i``, only the token at position ``i + 1`` is replaced by the mask
    token. The loss on each masked position is accumulated and the exponential
    of the mean loss is returned.

    Args:
        model: Masked language model, e.g.
            ``BertForMaskedLM.from_pretrained('bert-base-uncased')``. It is
            called as ``model(input_ids, labels=labels)``; the result must
            expose ``.loss`` and the model must expose ``.device``.
        tokenizer: Matching tokenizer, e.g.
            ``BertTokenizer.from_pretrained('bert-base-uncased')``. Must
            provide ``encode`` and ``convert_tokens_to_ids``.
        sentence (str): Raw (untokenized) sentence to score.
        mask_token (str, optional): Token string used for masking; it is
            converted to an id with ``tokenizer.convert_tokens_to_ids``.
            Defaults to '[MASK]'.
        max_length (int, optional): Maximum encoded length; longer inputs are
            truncated by the tokenizer. Defaults to 512.
        empty_cache (bool, optional): If True and the model lives on a CUDA
            device, drop the local tensors and call
            ``torch.cuda.empty_cache()`` before returning. Defaults to False.
        batch_size (int, optional): Number of masked copies sent through the
            model per forward pass. Defaults to 64.

    Returns:
        The exponential of the average per-token loss, as returned by
        ``np.exp``.

    Raises:
        ZeroDivisionError: If the encoded sentence has no token between its
            first and last positions, so there is nothing to mask.
    """
    mask_id = tokenizer.convert_tokens_to_ids(mask_token)
    # Assumes the tokenizer wraps the sentence in exactly one special token on
    # each side ([CLS] ... [SEP]); those two positions are never masked.
    tensor_input = tokenizer.encode(sentence, return_tensors='pt', max_length=max_length, truncation=True)
    seq_len = tensor_input.size(-1)
    repeat_input = tensor_input.repeat(seq_len - 2, 1)
    # Row i is True only at column i + 1: one masked inner token per copy.
    mask = torch.ones(seq_len - 1).diag(1)[:-2] == 1
    masked_input = repeat_input.masked_fill(mask, mask_id)
    # -100 marks positions to be ignored by the loss. The labels are derived
    # from the mask itself, so a sentence that already contains the mask token
    # does not contribute extra labelled positions.
    labels = repeat_input.masked_fill(~mask, -100)
    n_rows = masked_input.shape[0]

    total_loss = 0.
    with torch.inference_mode():
        masked_input = masked_input.to(model.device)
        labels = labels.to(model.device)
        for start in range(0, n_rows, batch_size):
            batch_input = masked_input[start:start + batch_size]
            batch_labels = labels[start:start + batch_size]
            batch_loss = model(batch_input, labels=batch_labels).loss
            # The loss is treated as a mean over the batch's masked tokens (one
            # per row), so it is re-weighted by the batch size before summing.
            total_loss += (batch_loss * batch_input.shape[0]).item()
    result = np.exp(total_loss / n_rows)

    if model.device.type == 'cuda' and empty_cache:
        del masked_input
        del labels
        torch.cuda.empty_cache()
    return result
def rouge_helper_prepare_results(m, p, r, f):
    """Format one ROUGE metric as a single tab-separated display line.

    :param m: metric name printed at the start of the line, e.g. 'rouge-1'.
    :param p: precision as a fraction; shown multiplied by 100.
    :param r: recall as a fraction; shown multiplied by 100.
    :param f: F1 score as a fraction; shown multiplied by 100.
    :return: a string of the form '\\t<m>:\\tP: xx.xx\\tR: xx.xx\\tF1: xx.xx'.
    """
    template = '\t{}:\tP: {:5.2f}\tR: {:5.2f}\tF1: {:5.2f}'
    return template.format(m, 100.0 * p, 100.0 * r, 100.0 * f)
def remove_sub_strings(predicted_txt, tokens=('ᐛ ', ' ✬', '<unk>')):
    """
    Remove every occurrence of each string in tokens from predicted_txt.

    The tokens are removed one after another in the given order, so removing
    an earlier token can create or destroy matches for a later one.

    :param predicted_txt: the text to clean.
    :param tokens: iterable of substrings to delete. The default is a tuple so
        that no mutable object is shared between calls.
    :return: predicted_txt with all listed substrings removed.
    """
    for token in tokens:
        predicted_txt = predicted_txt.replace(token, "")
    return predicted_txt
def remove_sub_strings_chinese(predicted_txt, tokens=('ᐛ', '✬', '<unk>')):
    """
    Remove every occurrence of each string in tokens from predicted_txt.

    Unlike the default tokens of remove_sub_strings, the default markers here
    carry no surrounding spaces. The tokens are removed one after another in
    the given order.

    :param predicted_txt: the text to clean.
    :param tokens: iterable of substrings to delete. The default is a tuple so
        that no mutable object is shared between calls.
    :return: predicted_txt with all listed substrings removed.
    """
    for token in tokens:
        predicted_txt = predicted_txt.replace(token, "")
    return predicted_txt
def translation_paraphrase_evaluation_english_tagpa(sources, hypos, refs, print_scores=True, max_n=4, rouge_alpha=0.5,
                                                    rouge_weight_factor=1.2, rouge_stemming=True):
    """
    Evaluate generated paraphrases or translations with BLEU and ROUGE scores.
    Nothing should be tokenized here.

    Three comparisons are made, each with ROUGE (aggregated as 'Avg' and 'Best') and with BLEU-1..4:
    hypotheses vs. references, sources vs. references, and hypotheses vs. sources. The last one uses
    each source as the only reference and acts as a negative score measuring paraphrase diversity.

    :param sources: source sentences to start with. e.g. ['Young woman with sheep on straw covered floor .', 'A man who is walking across the street .']
    :param hypos: generated hypotheses, one sentence string per source. e.g. ['a little girl with sheep on straw covered floor .', 'A man walking across the street .']
    :param refs: list of list of sentences. For each source, a list of possible references. e.g. [['Young woman with sheep on straw covered floor .', 'Young woman on the floor .'], ['A man who is walking across the street now.', 'A man walking across the street.']]
    :param print_scores: if True, print every score while it is computed.
    :param max_n: passed to rouge.Rouge as max_n.
    :param rouge_alpha: passed to rouge.Rouge as alpha (0.5 is the F1 score).
    :param rouge_weight_factor: passed to rouge.Rouge as weight_factor.
    :param rouge_stemming: passed to rouge.Rouge as stemming.
    :return: a dictionary of scores. ROUGE entries are keyed '<comparison>_rouge_<Avg|Best>', BLEU entries
        'bleu_<n>', 'source_sentence_bleu_<n>' and 'sources_as_refs_diversity_negative_bleu_<n>'.
    """
    import rouge  # pip install git+https://github.com/Mohan-Zhang-u/py-rouge.git

    # we use source as the reference to compute a negative score, in order to measure the diversity of paraphrasing.
    sources_refs = [[sentence] for sentence in sources]
    metrics_dict = {}

    # name -> (candidate sentences, reference lists) for every ROUGE comparison
    rouge_comparisons = {
        'hypos': (hypos, refs),
        'sources': (sources, refs),
        'sources_refs_diversity_negative': (hypos, sources_refs),
    }
    for aggregator in ['Avg', 'Best']:
        evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                                max_n=max_n,
                                apply_avg=aggregator == 'Avg',
                                apply_best=aggregator == 'Best',
                                alpha=rouge_alpha,  # Default F1_score
                                weight_factor=rouge_weight_factor,
                                stemming=rouge_stemming)
        for key, (candidates, references) in rouge_comparisons.items():
            scores = evaluator.get_scores(candidates, references)
            metrics_dict[key + '_rouge_' + aggregator] = scores
            if print_scores:
                print('Evaluation with {} with {}'.format(key, aggregator))
                # Exactly one of apply_avg / apply_best is always set here, so each
                # metric maps to a single aggregated {'p', 'r', 'f'} result.
                for metric, results in sorted(scores.items(), key=lambda x: x[0]):
                    print(rouge_helper_prepare_results(metric, results['p'], results['r'], results['f']))
                print()

    # BLEU works on token lists rather than raw strings.
    bleu_sources = [word_tokenize(source) for source in sources]
    bleu_hypos = [word_tokenize(hypo) for hypo in hypos]
    bleu_refs = [[word_tokenize(ref) for ref in ref_list] for ref_list in refs]
    bleu_sources_refs = [[word_tokenize(ref) for ref in ref_list] for ref_list in sources_refs]

    bleu_weights = {
        1: (1, 0, 0, 0),
        2: (0.5, 0.5, 0, 0),
        3: (0.33, 0.33, 0.34, 0),
        4: (0.25, 0.25, 0.25, 0.25),
    }
    # (key prefix, reference token lists, candidate token lists)
    bleu_comparisons = (
        ('bleu_', bleu_refs, bleu_hypos),
        ('source_sentence_bleu_', bleu_refs, bleu_sources),
        ('sources_as_refs_diversity_negative_bleu_', bleu_sources_refs, bleu_hypos),
    )
    bleu_keys = []
    for prefix, references, candidates in bleu_comparisons:
        for order, weights in bleu_weights.items():
            bleu_key = prefix + str(order)
            metrics_dict[bleu_key] = corpus_bleu(references, candidates, weights=weights)
            bleu_keys.append(bleu_key)

    if print_scores:
        for bleu_key in bleu_keys:
            # Scale before rounding so the percentage prints without float noise.
            print(bleu_key, "(percents):", round(metrics_dict[bleu_key] * 100, 2))
    return metrics_dict
def translation_paraphrase_evaluation(sources, hypos, refs, sentence_preproce_function=None, print_scores=True, max_n=4,
                                      rouge_alpha=0.5, rouge_weight_factor=1.2, rouge_stemming=True,
                                      hypo_style='first'):
    """
    Evaluate generated paraphrases or translations with BLEU and ROUGE scores.
    Nothing should be tokenized here.

    Three comparisons are made, each with ROUGE (aggregated as 'Avg' and 'Best') and with BLEU-1..4:
    hypotheses vs. references, sources vs. references, and hypotheses vs. sources. The last one uses
    each source as the only reference and acts as a negative score measuring paraphrase diversity.

    :param sources: list of source sentences to start with. e.g. ['Young woman with sheep on straw covered floor.', 'A man who is walking across the street.']
    :param hypos: generated hypotheses; for each source, a list of hypothesis sentences. e.g. [['A person petting the head of a cute fluffy sheep.', 'A child is petting a sheep while another sheep watches.'], ['a man walks behind an ice cream truck ', 'A man is crossing a street near an icecream truck.']]
    :param refs: list of list of sentences. For each source, a list of possible references. e.g. [['A woman standing next to a sheep in a pen .<unk>', 'A woman standing next to a sheep on a farm .<unk>'], ['A man crossing the street in a city .<unk>', 'A person crossing the street in a city .<unk>']]
    :param sentence_preproce_function: a function that will be applied to all sentences in sources, the evaluated hypotheses and refs, e.g. to remove special tokens.
    :param print_scores: if True, print every score while it is computed.
    :param max_n: passed to rouge.Rouge as max_n.
    :param rouge_alpha: passed to rouge.Rouge as alpha (0.5 is the F1 score).
    :param rouge_weight_factor: passed to rouge.Rouge as weight_factor.
    :param rouge_stemming: passed to rouge.Rouge as stemming.
    :param hypo_style: how to evaluate the generated hypotheses. Intended to be one of ['first', 'best', 'average'], but only 'first' (score the first hypothesis of every list) is implemented.
    :return: a dictionary of scores. ROUGE entries are keyed '<comparison>_rouge_<Avg|Best>', BLEU entries
        'bleu_<n>', 'source_sentence_bleu_<n>' and 'sources_as_refs_diversity_negative_bleu_<n>'.
    :raises NotImplementedError: if hypo_style is anything other than 'first'.
    """
    assert (isinstance(sources, list))
    assert (isinstance(sources[0], str))
    assert (isinstance(hypos, list))
    assert (isinstance(hypos[0], list))
    assert (isinstance(hypos[0][0], str))
    assert (isinstance(refs, list))
    assert (isinstance(refs[0], list))
    assert (isinstance(refs[0][0], str))
    if hypo_style != 'first':
        raise NotImplementedError("hypo_style {!r} is not implemented; only 'first' is supported".format(hypo_style))

    # Imported only after the arguments are validated so bad input fails fast.
    import rouge  # pip install git+https://github.com/Mohan-Zhang-u/py-rouge.git

    # 'first': only the first hypothesis of each list is scored.
    hypos = [hypo_list[0] for hypo_list in hypos]

    # apply sentence_preproce_function, e.g. remove_tokens
    if sentence_preproce_function is not None:
        sources = [sentence_preproce_function(source) for source in sources]
        hypos = [sentence_preproce_function(hypo) for hypo in hypos]
        refs = [[sentence_preproce_function(ref) for ref in refs_list] for refs_list in refs]

    # we use source as the reference to compute a negative score, in order to measure the diversity of paraphrasing.
    sources_refs = [[sentence] for sentence in sources]
    metrics_dict = {}

    # name -> (candidate sentences, reference lists) for every ROUGE comparison
    rouge_comparisons = {
        'hypos': (hypos, refs),
        'sources': (sources, refs),
        'sources_refs_diversity_negative': (hypos, sources_refs),
    }
    for aggregator in ['Avg', 'Best']:
        evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                                max_n=max_n,
                                apply_avg=aggregator == 'Avg',
                                apply_best=aggregator == 'Best',
                                alpha=rouge_alpha,  # Default F1_score
                                weight_factor=rouge_weight_factor,
                                stemming=rouge_stemming)
        for key, (candidates, references) in rouge_comparisons.items():
            scores = evaluator.get_scores(candidates, references)
            metrics_dict[key + '_rouge_' + aggregator] = scores
            if print_scores:
                print('Evaluation with {} with {}'.format(key, aggregator))
                # Exactly one of apply_avg / apply_best is always set here, so each
                # metric maps to a single aggregated {'p', 'r', 'f'} result.
                for metric, results in sorted(scores.items(), key=lambda x: x[0]):
                    print(rouge_helper_prepare_results(metric, results['p'], results['r'], results['f']))
                print()

    # BLEU works on token lists rather than raw strings.
    bleu_sources = [word_tokenize(source) for source in sources]
    bleu_hypos = [word_tokenize(hypo) for hypo in hypos]
    bleu_refs = [[word_tokenize(ref) for ref in ref_list] for ref_list in refs]
    bleu_sources_refs = [[word_tokenize(ref) for ref in ref_list] for ref_list in sources_refs]

    bleu_weights = {
        1: (1, 0, 0, 0),
        2: (0.5, 0.5, 0, 0),
        3: (0.33, 0.33, 0.34, 0),
        4: (0.25, 0.25, 0.25, 0.25),
    }
    # (key prefix, reference token lists, candidate token lists)
    bleu_comparisons = (
        ('bleu_', bleu_refs, bleu_hypos),
        ('source_sentence_bleu_', bleu_refs, bleu_sources),
        ('sources_as_refs_diversity_negative_bleu_', bleu_sources_refs, bleu_hypos),
    )
    bleu_keys = []
    for prefix, references, candidates in bleu_comparisons:
        for order, weights in bleu_weights.items():
            bleu_key = prefix + str(order)
            metrics_dict[bleu_key] = corpus_bleu(references, candidates, weights=weights)
            bleu_keys.append(bleu_key)

    if print_scores:
        for bleu_key in bleu_keys:
            # Scale before rounding so the percentage prints without float noise.
            print(bleu_key, "(percents):", round(metrics_dict[bleu_key] * 100, 2))
    return metrics_dict
def translation_paraphrase_evaluation_chinese(sources, hypos, refs, sentence_preproce_function=None, print_scores=True,
                                              max_n=4, rouge_alpha=0.5, rouge_weight_factor=1.2, rouge_stemming=True,
                                              hypo_style='first', word_segmentor='character'):
    """
    Evaluate generated Chinese paraphrases or translations with BLEU and ROUGE scores.
    Nothing should be tokenized here.

    Three comparisons are made, each with ROUGE (aggregated as 'Avg' and 'Best') and with BLEU-1..4:
    hypotheses vs. references, sources vs. references, and hypotheses vs. sources. The last one uses
    each source as the only reference and acts as a negative score measuring paraphrase diversity.
    Sentences are segmented first; BLEU receives the token lists and ROUGE receives the tokens
    joined with single spaces.

    :param sources: list of source sentences to start with.
    :param hypos: generated hypotheses; for each source, a list of hypothesis sentences.
    :param refs: list of list of sentences. For each source, a list of possible references.
    :param sentence_preproce_function: a function that will be applied to all sentences in sources, hypos, refs
    :param print_scores: if True, print every score while it is computed.
    :param max_n: passed to rouge.Rouge as max_n.
    :param rouge_alpha: passed to rouge.Rouge as alpha (0.5 is the F1 score).
    :param rouge_weight_factor: passed to rouge.Rouge as weight_factor.
    :param rouge_stemming: passed to rouge.Rouge as stemming.
    :param hypo_style: how to evaluate the generated hypotheses. Intended to be one of ['first', 'best', 'average'], but only 'first' (score the first hypothesis of every list) is implemented.
    :param word_segmentor: 'character' means separate each character to be a word, 'hanlp' means the tokenizer returned by hanlp.load('LARGE_ALBERT_BASE').
    :return: a dictionary of scores. ROUGE entries are keyed '<comparison>_rouge_<Avg|Best>', BLEU entries
        'bleu_<n>', 'source_sentence_bleu_<n>' and 'sources_as_refs_diversity_negative_bleu_<n>'.
    :raises NotImplementedError: if hypo_style is anything other than 'first'.
    :raises ValueError: if word_segmentor is neither 'character' nor 'hanlp'.
    """
    assert (isinstance(sources, list))
    assert (isinstance(sources[0], str))
    assert (isinstance(hypos, list))
    assert (isinstance(hypos[0], list))
    assert (isinstance(hypos[0][0], str))
    assert (isinstance(refs, list))
    assert (isinstance(refs[0], list))
    assert (isinstance(refs[0][0], str))
    # Reject unsupported options up front, before any expensive import or model load.
    if hypo_style != 'first':
        raise NotImplementedError("hypo_style {!r} is not implemented; only 'first' is supported".format(hypo_style))
    if word_segmentor not in ('character', 'hanlp'):
        raise ValueError("word_segmentor must be 'character' or 'hanlp', got {!r}".format(word_segmentor))

    import rouge  # pip install git+https://github.com/Mohan-Zhang-u/py-rouge.git

    # apply sentence_preproce_function, e.g. remove_tokens
    if sentence_preproce_function is not None:
        sources = [sentence_preproce_function(source) for source in sources]
        hypos = [[sentence_preproce_function(hypo) for hypo in hypo_list] for hypo_list in hypos]
        refs = [[sentence_preproce_function(ref) for ref in refs_list] for refs_list in refs]
    # we use source as the reference to compute a negative score, in order to measure the diversity of paraphrasing.
    sources_refs = [[sentence] for sentence in sources]
    metrics_dict = {}

    # tokenize chinese sentences.
    if word_segmentor == 'character':
        def segment(sentence):
            # Space-join the characters, then split on single spaces. NOTE: a literal
            # space in the sentence therefore yields empty-string tokens.
            return ' '.join(sentence).split(' ')
    else:
        import hanlp
        segment = hanlp.load('LARGE_ALBERT_BASE')

    # BLEU works on token lists.
    bleu_sources = [segment(source) for source in sources]
    bleu_hypos = [[segment(hypo) for hypo in hypo_list] for hypo_list in hypos]
    bleu_refs = [[segment(ref) for ref in ref_list] for ref_list in refs]
    bleu_sources_refs = [[segment(ref) for ref in ref_list] for ref_list in sources_refs]

    # ROUGE works on strings: the same tokens, joined with single spaces.
    sources = [' '.join(tokens) for tokens in bleu_sources]
    refs = [[' '.join(tokens) for tokens in ref_list] for ref_list in bleu_refs]
    sources_refs = [[' '.join(tokens) for tokens in ref_list] for ref_list in bleu_sources_refs]
    hypos = [[' '.join(tokens) for tokens in hypo_list] for hypo_list in bleu_hypos]

    # 'first': only the first hypothesis of each list is scored.
    hypos = [hypo_list[0] for hypo_list in hypos]
    bleu_hypos = [hypo_list[0] for hypo_list in bleu_hypos]

    # name -> (candidate sentences, reference lists) for every ROUGE comparison
    rouge_comparisons = {
        'hypos': (hypos, refs),
        'sources': (sources, refs),
        'sources_refs_diversity_negative': (hypos, sources_refs),
    }
    for aggregator in ['Avg', 'Best']:
        evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                                max_n=max_n,
                                apply_avg=aggregator == 'Avg',
                                apply_best=aggregator == 'Best',
                                alpha=rouge_alpha,  # Default F1_score
                                weight_factor=rouge_weight_factor,
                                stemming=rouge_stemming,
                                language='chinese')
        for key, (candidates, references) in rouge_comparisons.items():
            scores = evaluator.get_scores(candidates, references)
            metrics_dict[key + '_rouge_' + aggregator] = scores
            if print_scores:
                print('Evaluation with {} with {}'.format(key, aggregator))
                # Exactly one of apply_avg / apply_best is always set here, so each
                # metric maps to a single aggregated {'p', 'r', 'f'} result.
                for metric, results in sorted(scores.items(), key=lambda x: x[0]):
                    print(rouge_helper_prepare_results(metric, results['p'], results['r'], results['f']))
                print()

    bleu_weights = {
        1: (1, 0, 0, 0),
        2: (0.5, 0.5, 0, 0),
        3: (0.33, 0.33, 0.34, 0),
        4: (0.25, 0.25, 0.25, 0.25),
    }
    # (key prefix, reference token lists, candidate token lists)
    bleu_comparisons = (
        ('bleu_', bleu_refs, bleu_hypos),
        ('source_sentence_bleu_', bleu_refs, bleu_sources),
        ('sources_as_refs_diversity_negative_bleu_', bleu_sources_refs, bleu_hypos),
    )
    bleu_keys = []
    for prefix, references, candidates in bleu_comparisons:
        for order, weights in bleu_weights.items():
            bleu_key = prefix + str(order)
            metrics_dict[bleu_key] = corpus_bleu(references, candidates, weights=weights)
            bleu_keys.append(bleu_key)

    if print_scores:
        for bleu_key in bleu_keys:
            # Scale before rounding so the percentage prints without float noise.
            print(bleu_key, "(percents):", round(metrics_dict[bleu_key] * 100, 2))
    return metrics_dict