| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| """Python implementation of BLEU and smooth-BLEU. |
| |
| This module provides a Python implementation of BLEU and smooth-BLEU. |
| Smooth BLEU is computed following the method outlined in the paper: |
| Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic |
| evaluation metrics for machine translation. COLING 2004. |
| """ |
|
|
| import collections |
| import math |
|
|
|
|
| def _get_ngrams(segment, max_order): |
| """Extracts all n-grams upto a given maximum order from an input segment. |
| |
| Args: |
| segment: text segment from which n-grams will be extracted. |
| max_order: maximum length in tokens of the n-grams returned by this |
| methods. |
| |
| Returns: |
| The Counter containing all n-grams upto max_order in segment |
| with a count of how many times each n-gram occurred. |
| """ |
| ngram_counts = collections.Counter() |
| for order in range(1, max_order + 1): |
| for i in range(0, len(segment) - order + 1): |
| ngram = tuple(segment[i:i+order]) |
| ngram_counts[ngram] += 1 |
| return ngram_counts |
|
|
|
|
def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 smooth=False):
  """Computes BLEU score of translated segments against one or more references.

  Args:
    reference_corpus: list of lists of references for each translation. Each
        reference should be tokenized into a list of tokens.
    translation_corpus: list of translations to score. Each translation
        should be tokenized into a list of tokens.
    max_order: Maximum n-gram order to use when computing BLEU score.
    smooth: Whether or not to apply Lin et al. 2004 smoothing.

  Returns:
    6-Tuple of (bleu, precisions, bp, ratio, translation_length,
    reference_length): the BLEU score, the list of per-order n-gram
    precisions, the brevity penalty, the translation/reference length ratio,
    and the two corpus lengths.
  """
  matches_by_order = [0] * max_order
  possible_matches_by_order = [0] * max_order
  reference_length = 0
  translation_length = 0
  for (references, translation) in zip(reference_corpus,
                                       translation_corpus):
    # Brevity penalty uses the shortest reference length per segment.
    reference_length += min(len(r) for r in references)
    translation_length += len(translation)

    # Union (elementwise max) of reference counts: clips each n-gram's
    # credit to the most generous single reference.
    merged_ref_ngram_counts = collections.Counter()
    for reference in references:
      merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
    translation_ngram_counts = _get_ngrams(translation, max_order)
    # Intersection (elementwise min) gives the clipped matches.
    overlap = translation_ngram_counts & merged_ref_ngram_counts
    for ngram in overlap:
      matches_by_order[len(ngram) - 1] += overlap[ngram]
    for order in range(1, max_order + 1):
      possible_matches = len(translation) - order + 1
      if possible_matches > 0:
        possible_matches_by_order[order - 1] += possible_matches

  precisions = [0] * max_order
  for i in range(0, max_order):
    if smooth:
      # Lin & Och (2004) add-one smoothing on every order.
      precisions[i] = ((matches_by_order[i] + 1.) /
                       (possible_matches_by_order[i] + 1.))
    else:
      if possible_matches_by_order[i] > 0:
        precisions[i] = (float(matches_by_order[i]) /
                         possible_matches_by_order[i])
      else:
        precisions[i] = 0.0

  if min(precisions) > 0:
    # Geometric mean of the per-order precisions, computed in log space.
    p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
    geo_mean = math.exp(p_log_sum)
  else:
    geo_mean = 0

  # Guard against an empty reference corpus (would divide by zero).
  if reference_length > 0:
    ratio = float(translation_length) / reference_length
  else:
    ratio = 0.0

  if ratio > 1.0:
    bp = 1.
  elif ratio > 0.0:
    bp = math.exp(1 - 1. / ratio)
  else:
    # Empty translations (ratio == 0) previously raised ZeroDivisionError;
    # an empty hypothesis gets the maximal brevity penalty instead.
    bp = 0.0

  bleu = geo_mean * bp

  return (bleu, precisions, bp, ratio, translation_length, reference_length)
|
|
|
|
def _bleu(ref_file, trans_file, subword_option=None):
  """Computes smoothed corpus BLEU (scaled to 0-100) from two text files.

  Args:
    ref_file: path to a file with one tokenized reference per line.
    trans_file: path to a file with one tokenized translation per line.
    subword_option: unused; kept for call-site compatibility.

  Returns:
    BLEU score in [0, 100], rounded to two decimal places.
  """
  max_order = 4
  smooth = True

  # Read every reference file (currently just one) fully into memory.
  reference_text = []
  for reference_filename in [ref_file]:
    with open(reference_filename) as fh:
      reference_text.append(fh.readlines())

  # Regroup so each translation gets its own list of tokenized references.
  per_segment_references = [
      [reference.strip().split() for reference in references]
      for references in zip(*reference_text)
  ]

  with open(trans_file) as fh:
    translations = [line.strip().split() for line in fh]

  bleu_score = compute_bleu(
      per_segment_references, translations, max_order, smooth)[0]
  return round(100 * bleu_score, 2)