diff --git a/.gitignore b/.gitignore
index d50d1667..9831d608 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ local_scripts/*
 dataset/*
 generated/*
 *events.out.tfevents*
+*.pkl
 .DS_Store
 .idea/
diff --git a/genienlp/calibrate.py b/genienlp/calibrate.py
index 998c0b45..e4241cca 100644
--- a/genienlp/calibrate.py
+++ b/genienlp/calibrate.py
@@ -230,6 +230,12 @@ class ConfidenceEstimator():
 
         return features, labels
 
+    def estimate(self, confidences: Iterable[ConfidenceOutput]):
+        features, labels = self.convert_to_dataset(confidences, train=False)
+        dataset = xgb.DMatrix(data=features, label=labels)
+        confidence_scores = ConfidenceEstimator._extract_confidence_scores(self.model, dataset)
+        return confidence_scores
+
     def evaluate(self, dev_features, dev_labels):
         dev_dataset = xgb.DMatrix(data=dev_features, label=dev_labels)
         confidence_scores = ConfidenceEstimator._extract_confidence_scores(self.model, dev_dataset)
diff --git a/genienlp/predict.py b/genienlp/predict.py
index 1fa6b98d..372ca9e1 100644
--- a/genienlp/predict.py
+++ b/genienlp/predict.py
@@ -167,9 +167,10 @@ def run(args, device):
             else:
                 raise OSError(f'{results_file_name} already exists')
 
-        generation_outputs = generate_with_model(model, it, model.numericalizer, task, args, prediction_file_name, output_confidences=args.output_confidences, original_order=original_order)
+        generation_outputs = generate_with_model(model, it, model.numericalizer, task, args, output_confidences=args.output_confidences, original_order=original_order)
+
         if args.output_confidences:
-            _, predictions, answers, contexts, confidences = generation_outputs
+            _, example_ids, predictions, answers, contexts, confidences = generation_outputs
             # print('confidences = ', confidences)
             import pickle
@@ -177,7 +178,12 @@ def run(args, device):
                 pickle.dump(confidences, f, protocol=4)
 
         else:
-            _, predictions, answers, contexts = generation_outputs
+            _, example_ids, predictions, answers, contexts = generation_outputs
+
+        # write into file
+        with open(prediction_file_name, 'w' + ('' if args.overwrite else 'x')) as prediction_file:
+            for i in range(len(example_ids)):
+                prediction_file.write(example_ids[i] + '\t' + '\t'.join(predictions[i]) + '\n') # write all outputs in the prediction file, separated by \t
 
         if len(answers) > 0:
             metrics_to_compute = task.metrics
@@ -207,7 +213,7 @@ def run(args, device):
 
 
 def parse_argv(parser):
-    parser.add_argument('--path', required=True)
+    parser.add_argument('--path', type=str, required=True, help='Folder to load the model from')
     parser.add_argument('--evaluate', type=str, required=True, choices=['valid', 'test'], help='Which dataset to do predictions for (test or dev)')
     parser.add_argument('--pred_set_name', type=str, help='Name of dataset to run prediction for; will be ignored if --evaluate is test')
diff --git a/genienlp/server.py b/genienlp/server.py
index 3abcb969..74717e45 100644
--- a/genienlp/server.py
+++ b/genienlp/server.py
@@ -86,7 +86,7 @@ class Server:
         self.model.add_new_vocab_from_data([task])
         batch = self.numericalize_examples(examples)
         # it is a single batch, so wrap it in []
-        predictions = generate_with_model(self.model, [batch], self.numericalizer, task, self.args, prediction_file_name=None, output_predictions_only=True)
+        predictions = generate_with_model(self.model, [batch], self.numericalizer, task, self.args, output_predictions_only=True)
         response = json.dumps({ 'id': request['id'], 'instances': [{ 'answer': p[0] } for p in predictions] })
         return response + '\n'
@@ -103,7 +103,7 @@ class Server:
 
         self.model.add_new_vocab_from_data([task])
         batch = self.numericalize_examples([ex])
-        predictions = generate_with_model(self.model, [batch], self.numericalizer, task, self.args, prediction_file_name=None, output_predictions_only=True)
+        predictions = generate_with_model(self.model, [batch], self.numericalizer, task, self.args, output_predictions_only=True)
         response = json.dumps(dict(id=request['id'], answer=predictions[0][0]))
         return response + '\n'
diff --git a/genienlp/validate.py b/genienlp/validate.py
index 3d0b2953..c2052582 100644
--- a/genienlp/validate.py
+++ b/genienlp/validate.py
@@ -35,7 +35,7 @@ from collections import OrderedDict
 from .metrics import compute_metrics
 
 
-def generate_with_model(model, data_iterator, numericalizer, task, args, prediction_file_name=None, output_predictions_only=False, output_confidences=False, original_order=None):
+def generate_with_model(model, data_iterator, numericalizer, task, args, output_predictions_only=False, output_confidences=False, original_order=None):
     """
     Inputs:
         original_order: List of indices. If provided, we will sort the results according to this order
@@ -96,20 +96,16 @@ def generate_with_model(model, data_iterator, numericalizer, task, args, predict
 
     if original_order is not None:
         # sort back to the original order
         original_order, example_ids, predictions, answers, contexts, confidences = [list(a) for a in tuple(zip(*sorted(list(zip(original_order, example_ids, predictions, answers, contexts, confidences)))))]
-
-    if prediction_file_name is not None:
-        with open(prediction_file_name, 'w' + ('' if args.overwrite else 'x')) as prediction_file:
-            for i in range(len(example_ids)):
-                prediction_file.write(example_ids[i] + '\t' + '\t'.join(predictions[i]) + '\n') # write all outputs in the prediction file, separated by \t
-    if output_predictions_only:
-        return predictions
     # TODO calculate and return loss
     loss = None
+
+    if output_predictions_only:
+        return predictions
     if output_confidences:
-        return loss, predictions, answers, contexts, confidences
+        return loss, example_ids, predictions, answers, contexts, confidences
     else:
-        return loss, predictions, answers, contexts
+        return loss, example_ids, predictions, answers, contexts
 
 
 def calculate_and_reduce_metrics(predictions, answers, metrics_to_compute, args):
@@ -142,7 +138,7 @@ def validate(task, val_iter, model, numericalizer, args, num_print=10):
     with torch.no_grad():
         model.eval()
         names = ['beam search', 'answer', 'context']
-        loss, predictions, answers, contexts = generate_with_model(model, val_iter, numericalizer, task, args, prediction_file_name=None)
+        loss, _, predictions, answers, contexts = generate_with_model(model, val_iter, numericalizer, task, args)
         metrics = calculate_and_reduce_metrics(predictions, answers, task.metrics, args)
         results = [predictions, answers, contexts]