diff --git a/genienlp/data_manipulation_scripts/clean_paraphrasing_dataset.py b/genienlp/data_manipulation_scripts/clean_paraphrasing_dataset.py index 8b13a2b7..057a957b 100644 --- a/genienlp/data_manipulation_scripts/clean_paraphrasing_dataset.py +++ b/genienlp/data_manipulation_scripts/clean_paraphrasing_dataset.py @@ -2,7 +2,7 @@ from argparse import ArgumentParser import csv import sys from tqdm import tqdm -from genienlp.util import detokenize, get_number_of_lines +from genienlp.util import detokenize csv.field_size_limit(sys.maxsize) diff --git a/genienlp/data_manipulation_scripts/dialog_to_tsv.py b/genienlp/data_manipulation_scripts/dialog_to_tsv.py index 339aa083..9cdaa16b 100644 --- a/genienlp/data_manipulation_scripts/dialog_to_tsv.py +++ b/genienlp/data_manipulation_scripts/dialog_to_tsv.py @@ -1,7 +1,5 @@ from argparse import ArgumentParser import csv -from tqdm import tqdm -import re def read_dialog_file(dialog_file, args): diff --git a/genienlp/run_generation.py b/genienlp/run_generation.py index 78de567b..d7422d64 100644 --- a/genienlp/run_generation.py +++ b/genienlp/run_generation.py @@ -51,7 +51,7 @@ from transformers import BertForMaskedLM, BertTokenizer from .util import set_seed, get_number_of_lines, combine_files_on_disk, split_file_on_disk, get_file_part_path, detokenize, tokenize, lower_case, \ top_k_top_p_filtering, SpecialTokenMap, remove_thingtalk_quotes from .metrics import computeBLEU -from .models.common import BeamHypotheses +# from .models.common import BeamHypotheses logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', @@ -331,7 +331,7 @@ def input_heuristics(s: str, thingtalk=None, is_cased=False, keep_special_tokens s = tokenize(s) # Put question mark at the end whenever necessary. - sentences = [sentence.strip() for sentence in re.split('\s+([.|?|!|:])\s*', s) if len(sentence) > 0] + sentences = [sentence.strip() for sentence in re.split('\s+([.?!:])\s*', s) if len(sentence) > 0] # print('sentences = ', sentences) for idx in range(len(sentences)): if sentences[idx] in ['.', '?' , '!', ':']: diff --git a/genienlp/run_lm_finetuning.py b/genienlp/run_lm_finetuning.py index 896efc79..c7dae171 100644 --- a/genienlp/run_lm_finetuning.py +++ b/genienlp/run_lm_finetuning.py @@ -32,7 +32,6 @@ import shutil import torch import math import csv -import numpy as np from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler from torch.nn.utils.rnn import pad_sequence from torch.utils.data.distributed import DistributedSampler