From e6a3c1020cf3271dd7b8f110c9143b57a2c656ee Mon Sep 17 00:00:00 2001 From: Sina Date: Sun, 14 Feb 2021 15:39:09 -0800 Subject: [PATCH 1/3] Ignore quoted sentences during paraphrasing --- genienlp/paraphrase/data_utils.py | 4 +- .../paraphrase/scripts/transform_dataset.py | 17 +++- genienlp/tasks/almond_task.py | 79 ++++++++++++++++++- 3 files changed, 94 insertions(+), 6 deletions(-) diff --git a/genienlp/paraphrase/data_utils.py b/genienlp/paraphrase/data_utils.py index e5f97886..61de20f1 100644 --- a/genienlp/paraphrase/data_utils.py +++ b/genienlp/paraphrase/data_utils.py @@ -28,8 +28,8 @@ special_pattern_mapping = [ ['$13', 'thirteen dollars', '13 dollars', '$ 13', '$ 13.00', '13.00', '13']]), SpecialTokenMap('DURATION_([0-9]+)', ['5 weeks', '6 weeks'], [['5 weeks', 'five weeks'], ['6 weeks', 'six weeks']]), SpecialTokenMap('LOCATION_([0-9]+)', ['locatio1n', 'locatio2n'], [['locatio1n', 'locat1n'], ['locatio2n', 'locat2n']]), - SpecialTokenMap('QUOTED_STRING_([0-9]+)', lambda x: 'Chinese', lambda x: ['Chinese', 'chinese', 'china']), # TODO change to be more general than cuisine - SpecialTokenMap('GENERIC_ENTITY_uk.ac.cam.multiwoz.Restaurant:Restaurant_([0-9]+)', ["restaurant1", "restaurant2", "restaurant3"]) # TODO the only reason we can get away with this unnatural replacement is that actual backward is not going to be called for this + # SpecialTokenMap('QUOTED_STRING_([0-9]+)', ['Chinese', 'Italian'], [['Chinese', 'chinese', 'china'], ['Italian', 'italian']]), # TODO change to be more general than cuisine + # SpecialTokenMap('GENERIC_ENTITY_uk.ac.cam.multiwoz.Restaurant:Restaurant_([0-9]+)', ["restaurant1", "restaurant2", "restaurant3"]) # TODO the only reason we can get away with this unnatural replacement is that actual backward is not going to be called for this ] diff --git a/genienlp/paraphrase/scripts/transform_dataset.py b/genienlp/paraphrase/scripts/transform_dataset.py index e4016e1a..494d44e3 100644 --- 
a/genienlp/paraphrase/scripts/transform_dataset.py +++ b/genienlp/paraphrase/scripts/transform_dataset.py @@ -10,7 +10,16 @@ def is_subset(set1, set2): """ return all([e in set2 for e in set1]) -def passes_heuristic_checks(row, args): +def passes_heuristic_checks(row, args, old_query=None): + if 'QUOTED_' in row[args.utterance_column]: + # remove quoted examples + return False + if old_query is not None: + old_special_tokens = set(re.findall('[A-Za-z:_.]+_[0-9]', old_query)) + new_special_tokens = set(re.findall('[A-Za-z:_.]+_[0-9]', row[args.utterance_column])) + # check that all the special tokens in utterance after paraphrasing are the same as before + if set(old_special_tokens) != set(new_special_tokens): + return False all_input_columns = ' '.join([row[c] for c in args.input_columns]) input_special_tokens = set(re.findall('[A-Za-z:_.]+_[0-9]', all_input_columns)) output_special_tokens = set(re.findall('[A-Za-z:_.]+_[0-9]', row[args.thingtalk_column])) @@ -110,6 +119,7 @@ def main(args): seen_examples = set() all_thrown_away_rows = [] for row_idx, row in enumerate(progress_bar(reader, desc='Lines')): + old_query = None output_rows = [] thrown_away_rows = [] if args.transformation == 'remove_thingtalk_quotes': @@ -136,6 +146,7 @@ def main(args): row[args.utterance_column] = new_query output_rows.append(row) elif args.transformation == 'replace_queries': + old_query = row[args.utterance_column] for idx, new_query in enumerate(new_queries[row_idx]): copy_row = row.copy() copy_row[args.utterance_column] = new_query @@ -152,7 +163,7 @@ def main(args): for o in output_rows: output_row = "" if args.remove_with_heuristics: - if not passes_heuristic_checks(o, args): + if not passes_heuristic_checks(o, args, old_query=old_query): heuristic_count += 1 continue if args.remove_duplicates: @@ -170,7 +181,7 @@ def main(args): output_row += '\t' output_file.write(output_row + '\n') for o in thrown_away_rows: - if not args.remove_with_heuristics or (args.remove_with_heuristics 
and passes_heuristic_checks(o, args)): + if not args.remove_with_heuristics or (args.remove_with_heuristics and passes_heuristic_checks(o, args, old_query=old_query)): all_thrown_away_rows.append(o) if args.thrown_away is not None: diff --git a/genienlp/tasks/almond_task.py b/genienlp/tasks/almond_task.py index 8cae86f8..70e25a0e 100644 --- a/genienlp/tasks/almond_task.py +++ b/genienlp/tasks/almond_task.py @@ -484,6 +484,83 @@ class NaturalSeq2Seq(BaseAlmondTask): return Example.from_raw(self.name + '/' + example_id, context, question, answer, preprocess=self.preprocess_field, lower=False) + def preprocess_field(self, sentence, field_name=None, answer=None): + if self.override_context is not None and field_name == 'context': + pad_feature = get_pad_feature(self.args.ned_features, self.args.ned_features_default_val, self.args.ned_features_size) + return self.override_context, [pad_feature] * len(self.override_context.split(' ')) if pad_feature else [], self.override_context + if self.override_question is not None and field_name == 'question': + pad_feature = get_pad_feature(self.args.ned_features, self.args.ned_features_default_val, self.args.ned_features_size) + return self.override_question, [pad_feature] * len(self.override_question.split(' ')) if pad_feature else [], self.override_question + if not sentence: + return '', [], '' + + tokens = sentence.split(' ') + new_tokens = [] + for token in tokens: + new_tokens.append(token) + tokens = new_tokens + new_sentence = ' '.join(tokens) + + if self._almond_detokenize_sentence: + + # BERT tokenizers by default add whitespace around any CJK character + # SPM-based tokenizers are trained on raw text and do better when they receive untokenized text + # In genienlp we detokenize CJK characters and leave tokenization to the model's tokenizer + # NOTE: input datasets for almond are usually pretokenized using genie-toolkit which + # inserts whitespace around any CJK character. 
This detokenization ensures that SPM-based tokenizers + # see the text without space between those characters + new_sentence = detokenize_cjk_chars(new_sentence) + tokens = new_sentence.split(' ') + + new_sentence = '' + for token in tokens: + if token in (',', '.', '?', '!', ':', ')', ']', '}') or token.startswith("'"): + new_sentence += token + else: + new_sentence += ' ' + token + + new_sentence = new_sentence.strip() + new_tokens = new_sentence.split(' ') + new_sentence_length = len(new_tokens) + + tokens_type_ids, tokens_type_probs = None, None + + if 'type_id' in self.args.ned_features and field_name != 'answer': + tokens_type_ids = [[self.args.ned_features_default_val[0]] * self.args.ned_features_size[0] for _ in + range(new_sentence_length)] + if 'type_prob' in self.args.ned_features and field_name != 'answer': + tokens_type_probs = [[self.args.ned_features_default_val[1]] * self.args.ned_features_size[1] for _ in + range(new_sentence_length)] + + if self.args.do_ned and self.args.ned_retrieve_method != 'bootleg' and field_name not in self.no_feature_fields: + if 'type_id' in self.args.ned_features: + tokens_type_ids = self.find_type_ids(new_tokens, answer) + if 'type_prob' in self.args.ned_features: + tokens_type_probs = self.find_type_probs(new_tokens, self.args.ned_features_default_val[1], + self.args.ned_features_size[1]) + + if self.args.verbose and self.args.do_ned: + print() + print( + *[f'token: {token}\ttype: {token_type}' for token, token_type in zip(new_tokens, tokens_type_ids)], + sep='\n') + + zip_list = [] + if tokens_type_ids: + assert len(tokens_type_ids) == new_sentence_length + zip_list.append(tokens_type_ids) + if tokens_type_probs: + assert len(tokens_type_probs) == new_sentence_length + zip_list.append(tokens_type_probs) + + features = [Feature(*tup) for tup in zip(*zip_list)] + + sentence_plus_types = '' + if self.args.do_ned and self.args.add_types_to_text != 'no' and len(features): + sentence_plus_types = 
self.create_sentence_plus_types_tokens(new_sentence, features, self.args.add_types_to_text) + + return new_sentence, features, sentence_plus_types + def get_splits(self, root, **kwargs): return AlmondDataset.return_splits(path=os.path.join(root, 'almond'), make_example=self._make_example, **kwargs) @@ -518,7 +595,7 @@ class Paraphrase(NaturalSeq2Seq): sentence, reverse_map = input_heuristics(sentence, thingtalk=thingtalk, is_cased=True) # this task especially needs example ids to be unique - if example_id in self.reverse_maps: + while example_id in self.reverse_maps: example_id += '.' self.reverse_maps[example_id] = reverse_map From 799c3a9e5c6ffd01e6c999412ddeac8a2006605b Mon Sep 17 00:00:00 2001 From: Sina Date: Mon, 15 Feb 2021 23:10:28 -0800 Subject: [PATCH 2/3] Fix bug in paraphrase postprocessing --- genienlp/paraphrase/scripts/transform_dataset.py | 2 +- genienlp/validate.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/genienlp/paraphrase/scripts/transform_dataset.py b/genienlp/paraphrase/scripts/transform_dataset.py index 494d44e3..545f0d16 100--- a/genienlp/paraphrase/scripts/transform_dataset.py +++ b/genienlp/paraphrase/scripts/transform_dataset.py @@ -11,7 +11,7 @@ def is_subset(set1, set2): return all([e in set2 for e in set1]) def passes_heuristic_checks(row, args, old_query=None): - if 'QUOTED_' in row[args.utterance_column]: + if 'QUOTED_STRING' in row[args.utterance_column] or (old_query is not None and 'QUOTED_STRING' in old_query): # remove quoted examples return False if old_query is not None: diff --git a/genienlp/validate.py b/genienlp/validate.py index 69644af6..9d39503c 100644 --- a/genienlp/validate.py +++ b/genienlp/validate.py @@ -67,11 +67,12 @@ def generate_with_model(model, data_iterator, numericalizer, task, args, batch_size = len(batch.example_id) batch_prediction = [[] for _ in range(batch_size)] batch_confidence_features = [[] for _ in range(batch_size)] + batch_example_ids = batch.example_id - 
example_ids += batch.example_id + example_ids += batch_example_ids if not output_predictions_only: batch_answer = numericalizer.reverse(batch.answer.value.data) - batch_answer = [task.postprocess_prediction(example_ids[i], batch_answer[i]) for i in range(len(batch_answer))] + batch_answer = [task.postprocess_prediction(batch_example_ids[i], batch_answer[i]) for i in range(len(batch_answer))] answers += batch_answer batch_context = numericalizer.reverse(batch.context.value.data) contexts += batch_context @@ -99,7 +100,7 @@ def generate_with_model(model, data_iterator, numericalizer, task, args, partial_batch_prediction = numericalizer.reverse(raw_partial_batch_prediction) # post-process predictions for i in range(len(partial_batch_prediction)): - partial_batch_prediction[i] = task.postprocess_prediction(example_ids[(i//args.num_outputs[hyperparameter_idx]) % batch_size], partial_batch_prediction[i]) + partial_batch_prediction[i] = task.postprocess_prediction(batch_example_ids[(i//args.num_outputs[hyperparameter_idx]) % batch_size], partial_batch_prediction[i]) # put them into the right array for i in range(len(partial_batch_prediction)): batch_prediction[(i//args.num_outputs[hyperparameter_idx]) % batch_size].append(partial_batch_prediction[i]) From 1576f918aef5db2002c980af66aa5366cda58a47 Mon Sep 17 00:00:00 2001 From: Sina Date: Tue, 16 Feb 2021 16:47:22 -0800 Subject: [PATCH 3/3] Fix the hyphen bug in paraphrasing --- genienlp/paraphrase/scripts/transform_dataset.py | 13 ++++++++----- genienlp/util.py | 5 +++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/genienlp/paraphrase/scripts/transform_dataset.py b/genienlp/paraphrase/scripts/transform_dataset.py index 545f0d16..ccea33f0 100644 --- a/genienlp/paraphrase/scripts/transform_dataset.py +++ b/genienlp/paraphrase/scripts/transform_dataset.py @@ -4,6 +4,11 @@ import re from ...util import tokenize, lower_case, remove_thingtalk_quotes from ...data_utils.progbar import progress_bar 
+special_token_pattern = re.compile("(^|(?<= ))" + "[A-Z]+_[0-9]" + "($|(?= ))") +def find_special_tokens(s: str): + return list(sorted([a.group(0) for a in special_token_pattern.finditer(s)])) + + def is_subset(set1, set2): """ Returns True if set1 is a subset of or equal to set2 @@ -15,14 +20,12 @@ def passes_heuristic_checks(row, args, old_query=None): # remove quoted examples return False if old_query is not None: - old_special_tokens = set(re.findall('[A-Za-z:_.]+_[0-9]', old_query)) - new_special_tokens = set(re.findall('[A-Za-z:_.]+_[0-9]', row[args.utterance_column])) # check that all the special tokens in utterance after paraphrasing are the same as before - if set(old_special_tokens) != set(new_special_tokens): + if find_special_tokens(old_query) != find_special_tokens(row[args.utterance_column]): return False all_input_columns = ' '.join([row[c] for c in args.input_columns]) - input_special_tokens = set(re.findall('[A-Za-z:_.]+_[0-9]', all_input_columns)) - output_special_tokens = set(re.findall('[A-Za-z:_.]+_[0-9]', row[args.thingtalk_column])) + input_special_tokens = set(find_special_tokens(all_input_columns)) + output_special_tokens = set(find_special_tokens(row[args.thingtalk_column])) if not is_subset(output_special_tokens, input_special_tokens): return False _, quote_values = remove_thingtalk_quotes(row[args.thingtalk_column]) diff --git a/genienlp/util.py b/genienlp/util.py index a2452de8..0ba7f8b8 100644 --- a/genienlp/util.py +++ b/genienlp/util.py @@ -317,7 +317,7 @@ def unmask_special_tokens(string: str, exceptions: list): def detokenize(string: str): string, exceptions = mask_special_tokens(string) - tokens = ["'d", "n't", "'ve", "'m", "'re", "'ll", ".", ",", "?", "!", "'s", ")", ":"] + tokens = ["'d", "n't", "'ve", "'m", "'re", "'ll", ".", ",", "?", "!", "'s", ")", ":", "-"] for t in tokens: string = string.replace(' ' + t, t) string = string.replace("( ", "(") @@ -335,8 +335,9 @@ def tokenize(string: str): string = string.replace("(", "( 
") string = string.replace('gonna', 'gon na') string = string.replace('wanna', 'wan na') - string = re.sub('\s+', ' ', string) string = unmask_special_tokens(string, exceptions) + string = re.sub('([A-Za-z:_.]+_[0-9]+)-', r'\1 - ', string) # add space before and after hyphen, e.g. "NUMBER_0-hour" + string = re.sub('\s+', ' ', string) # remove duplicate spaces return string.strip()