diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py
index 5cb027d0e..369b0762c 100644
--- a/examples/pipeline/wiki_entity_linking/train_el.py
+++ b/examples/pipeline/wiki_entity_linking/train_el.py
@@ -4,11 +4,9 @@ from __future__ import unicode_literals
 import os
 import datetime
 from os import listdir
-from random import shuffle
 import numpy as np
 import random
 from thinc.neural._classes.convolution import ExtractWindow
-from thinc.neural._classes.feature_extracter import FeatureExtracter
 
 from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator
 
@@ -49,9 +47,6 @@ class EL_Model:
 
         self._build_cnn(hidden_entity_width=self.ENTITY_WIDTH, hidden_article_width=self.ARTICLE_WIDTH)
 
-        # self.entity_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM)
-        # self.article_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM)
-
     def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True):
         # raise errors instead of runtime warnings in case of int/float overflow
         np.seterr(all='raise')
@@ -69,16 +64,13 @@ class EL_Model:
                                                                            True,
                                                                            devlimit,
                                                                            to_print=False)
-
-        # self.sgd_entity = self.begin_training(self.entity_encoder)
-        # self.sgd_article = self.begin_training(self.article_encoder)
         self._begin_training()
 
         if self.PRINT_F:
-            _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True)
+            _, _, f_avg_train = -3.42, -3.42, -3.42   # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True)
             _, _, f_nonavg_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=False)
             _, _, f_random_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, calc_random=True)
-            _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True)
+            _, _, f_avg_dev = -3.42, -3.42, -3.42   # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True)
             _, _, f_nonavg_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=False)
             _, _, f_random_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, calc_random=True)
 
@@ -97,18 +89,6 @@ class EL_Model:
             print("Dev test on", len(dev_instances.values()), "articles")
             print()
 
-        # for article_id, inst_cluster_set in train_instances.items():
-            # article_doc = train_doc[article_id]
-            # print("training on", article_id, inst_cluster_set)
-            # pos_ex_list = list()
-            # neg_exs_list = list()
-            # for inst_cluster in inst_cluster_set:
-                # instance_count += 1
-                # pos_ex_list.append(train_pos.get(inst_cluster))
-                # neg_exs_list.append(train_neg.get(inst_cluster, []))
-
-            #self.update(article_doc, pos_ex_list, neg_exs_list)
-
         article_docs = list()
         entities = list()
         golds = list()
@@ -130,9 +110,9 @@ class EL_Model:
 
             # eval again
             if self.PRINT_F:
-                _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True)
+                _, _, f_avg_train = -3.42, -3.42, -3.42  # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True)
                 _, _, f_nonavg_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=False)
-                _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True)
+                _, _, f_avg_dev = -3.42, -3.42, -3.42  # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True)
                 _, _, f_nonavg_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=False)
 
                 print("avg/nonavg F train", round(f_avg_train, 1), round(f_nonavg_train, 1))
@@ -142,39 +122,6 @@ class EL_Model:
         if to_print:
             print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg")
 
-    def _test_dev_depr(self, dev_instances, dev_pos, dev_neg, dev_doc, avg=False, calc_random=False):
-        predictions = list()
-        golds = list()
-
-        for article_id, inst_cluster_set in dev_instances.items():
-            for inst_cluster in inst_cluster_set:
-                pos_ex = dev_pos.get(inst_cluster)
-                neg_exs = dev_neg.get(inst_cluster, [])
-                ex_to_id = dict()
-
-                if pos_ex and neg_exs:
-                    ex_to_id[pos_ex] = pos_ex._.entity_id
-                    for neg_ex in neg_exs:
-                        ex_to_id[neg_ex] = neg_ex._.entity_id
-
-                    article = inst_cluster.split(sep="_")[0]
-                    entity_id = inst_cluster.split(sep="_")[1]
-                    article_doc = dev_doc[article]
-
-                    examples = list(neg_exs)
-                    examples.append(pos_ex)
-                    shuffle(examples)
-
-                    best_entity, highest_prob = self._predict(examples, article_doc, avg)
-                    if calc_random:
-                        best_entity, highest_prob = self._predict_random(examples)
-                    predictions.append(ex_to_id[best_entity])
-                    golds.append(ex_to_id[pos_ex])
-
-        # TODO: use lowest_mse and combine with prior probability
-        p, r, f = run_el.evaluate(predictions, golds, to_print=False)
-        return p, r, f
-
     def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc, avg=False, calc_random=False):
         predictions = list()
         golds = list()
@@ -207,33 +154,6 @@ class EL_Model:
         p, r, f = run_el.evaluate(predictions, golds, to_print=False)
         return p, r, f
 
-    def _predict_depr(self, entities, article_doc, avg=False):
-        if avg:
-            with self.article_encoder.use_params(self.sgd_article.averages):
-                doc_encoding = self.article_encoder([article_doc])
-        else:
-            doc_encoding = self.article_encoder([article_doc])
-
-        highest_prob = None
-        best_entity = None
-
-        entity_to_vector = dict()
-        for entity in entities:
-            if avg:
-                with self.entity_encoder.use_params(self.sgd_entity.averages):
-                    entity_to_vector[entity] = self.entity_encoder([entity])
-            else:
-                entity_to_vector[entity] = self.entity_encoder([entity])
-
-        for entity in entities:
-            entity_encoding = entity_to_vector[entity]
-            prob = self._calculate_probability(doc_encoding, entity_encoding, entity_to_vector.values())
-            if not best_entity or prob > highest_prob:
-                highest_prob = prob
-                best_entity = entity
-
-        return best_entity, highest_prob
-
     def _predict(self, article_doc, entity, avg=False, apply_threshold=True):
         if avg:
             with self.sgd.use_params(self.model.averages):
@@ -252,11 +172,6 @@ class EL_Model:
             return float(1.0)
         return float(0.0)
 
-    def _predict_random_depr(self, entities):
-        highest_prob = 1
-        best_entity = random.choice(entities)
-        return best_entity, highest_prob
-
     def _predict_random(self, entity, apply_threshold=True):
         r = random.uniform(0, 1)
         if not apply_threshold:
@@ -275,29 +190,12 @@ class EL_Model:
 
             convolution_2 = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_output_with, hidden_output_with * 3))))
 
-            # self.entity_encoder | self.article_encoder \
-            # self.model = with_flatten(LN(Maxout(hidden_with, hidden_with)) >> convolution_2 ** 2, pad=2)  \
-            #          >> flatten_add_lengths \
-            #          >> ParametricAttention(hidden_with) \
-            #          >> Pooling(sum_pool) \
-            #          >> Softmax(nr_class, nr_class)
-
             self.model = Affine(hidden_output_with, hidden_input_with) \
                        >> LN(Maxout(hidden_output_with, hidden_output_with)) \
                        >> convolution_2 \
                        >> Affine(self.HIDDEN_2_WIDTH, hidden_output_with) \
                        >> Affine(1, self.HIDDEN_2_WIDTH) \
                        >> logistic
-                       # >> with_flatten(LN(Maxout(hidden_output_with, hidden_output_with)) >> convolution_2 ** 2, pad=2)
-
-                    #  >> convolution_2 \
-
-                       #  >> flatten_add_lengths
-                       #  >> ParametricAttention(hidden_output_with) \
-                       #  >> Pooling(max_pool) \
-                       #  >> Softmax(nr_class, nr_class)
-
-        # self.model.nO = nr_class
 
     @staticmethod
     def _encoder(in_width, hidden_width):
@@ -311,138 +209,9 @@ class EL_Model:
 
         return encoder
 
-    def begin_training_depr(self, model):
-        # TODO ? link_vectors_to_models(self.vocab) depr?
-        sgd = create_default_optimizer(model.ops)
-        return sgd
-
     def _begin_training(self):
-        # self.sgd_entity = self.begin_training(self.entity_encoder)
-        # self.sgd_article = self.begin_training(self.article_encoder)
         self.sgd = create_default_optimizer(self.model.ops)
 
-    # TODO: deprecated ?
-    def _simple_encoder_depr(self, in_width, out_width):
-        hidden_with = 128
-
-        conv_depth = 1
-        cnn_maxout_pieces = 3
-        with Model.define_operators({">>": chain, "**": clone}):
-            # encoder = SpacyVectors \
-            #            >> flatten_add_lengths \
-            #           >> ParametricAttention(in_width)\
-            #            >> Pooling(mean_pool) \
-            #           >> Residual(zero_init(Maxout(in_width, in_width)))  \
-            #           >> zero_init(Affine(out_width, in_width, drop_factor=0.0))
-            # encoder = SpacyVectors \
-            #         >> flatten_add_lengths \
-            #         >> with_getitem(0, Affine(in_width, in_width)) \
-            #         >> ParametricAttention(in_width) \
-            #         >> Pooling(sum_pool) \
-            #         >> Residual(ReLu(in_width, in_width)) ** conv_depth \
-            #         >> zero_init(Affine(out_width, in_width, drop_factor=0.0))
-            # encoder = SpacyVectors \
-            #        >> flatten_add_lengths \
-            #        >> ParametricAttention(in_width)\
-            #        >> Pooling(sum_pool) \
-            #        >> Residual(zero_init(Maxout(in_width, in_width)))  \
-            #        >> zero_init(Affine(out_width, in_width, drop_factor=0.0))
-
-            # >> zero_init(Affine(nr_class, width, drop_factor=0.0))
-            # >> logistic
-
-            #convolution = Residual(ExtractWindow(nW=1)
-            #                       >> LN(Maxout(in_width, in_width * 3, pieces=cnn_maxout_pieces))
-            #)
-            #encoder = SpacyVectors >> with_flatten(
-            #    embed >> convolution ** conv_depth, pad=conv_depth
-            #)
-
-            # static_vectors = SpacyVectors >> with_flatten(
-            #    Affine(in_width, in_width)
-            #)
-
-            convolution_2 = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_with, hidden_with * 3))))
-
-            encoder = SpacyVectors >> with_flatten(LN(Maxout(hidden_with, in_width)) >> convolution_2 ** 2, pad = 2)  \
-                      >> flatten_add_lengths \
-                      >> ParametricAttention(hidden_with) \
-                      >> Pooling(sum_pool) \
-                      >> Residual(zero_init(Maxout(hidden_with, hidden_with))) \
-                      >> zero_init(Affine(out_width, hidden_with, drop_factor=0.0)) \
-                      >> logistic
-
-            # convolution = Residual(ExtractWindow(nW=1) >> ReLu(in_width, in_width*3))
-
-            # encoder = static_vectors # >> with_flatten(
-            #    ReLu(in_width, in_width)
-            #    >> convolution ** conv_depth, pad=conv_depth) \
-            #    >> Affine(out_width, in_width, drop_factor=0.0)
-
-            # encoder = SpacyVectors >> with_flatten(
-            #    LN(Maxout(in_width, in_width))
-            #    >> Residual((ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3, pieces=cnn_maxout_pieces)))) ** conv_depth,
-            #    pad=conv_depth,
-            #)  >> zero_init(Affine(out_width, in_width, drop_factor=0.0))
-
-            # embed = SpacyVectors >> LN(Maxout(width, width, pieces=3))
-
-            # encoder = SpacyVectors >> flatten_add_lengths >> convolution ** conv_depth
-            # encoder = with_flatten(embed >> convolution ** conv_depth, pad=conv_depth)
-
-        return encoder
-
-    def update_depr(self, article_doc, true_entity_list, false_entities_list, drop=0., losses=None):
-        doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop)
-        doc_encoding = doc_encoding[0]
-        # print()
-        # print("doc", doc_encoding)
-
-        for i, true_entity in enumerate(true_entity_list):
-            try:
-                false_entities = false_entities_list[i]
-                if len(false_entities) > 0:
-                    # TODO: batch per doc
-
-                    all_entities = [true_entity]
-                    all_entities.extend(false_entities)
-
-                    entity_encodings, entity_bp = self.entity_encoder.begin_update(all_entities, drop=drop)
-                    true_entity_encoding = entity_encodings[0]
-                    false_entity_encodings = entity_encodings[1:]
-
-                    all_vectors = [true_entity_encoding]
-                    all_vectors.extend(false_entity_encodings)
-
-                    # consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding)
-
-                    true_prob = self._calculate_probability(doc_encoding, true_entity_encoding, all_vectors)
-                    # print("true", true_prob, true_entity_encoding)
-
-                    all_probs = [true_prob]
-                    for false_vector in false_entity_encodings:
-                        false_prob = self._calculate_probability(doc_encoding, false_vector, all_vectors)
-                        # print("false", false_prob, false_vector)
-                        all_probs.append(false_prob)
-
-                    loss = self._calculate_loss(true_prob, all_probs).astype(np.float32)
-                    if self.PRINT_LOSS:
-                        print("loss train", round(loss, 5))
-
-                    # for false_vector in false_vectors:
-                    #    false_gradient = -1 * self._calculate_entity_gradient(loss, doc_encoding, false_vector, false_vectors)
-                    #    print("false gradient", false_gradient)
-
-                    # doc_gradient = self._calculate_doc_gradient(loss, doc_encoding, true_entity_encoding, false_entity_encodings)
-                    true_gradient, doc_gradient = self._calculate_entity_gradient(loss, doc_encoding, true_entity_encoding, false_entity_encodings)
-                    # print("true_gradient", true_gradient)
-                    # print("doc_gradient", doc_gradient)
-                    article_bp([doc_gradient.astype(np.float32)], sgd=self.sgd_article)
-                    entity_bp([true_gradient.astype(np.float32)], sgd=self.sgd_entity)
-                    #true_entity_bp([true_gradient.astype(np.float32)], sgd=self.sgd_entity)
-            except Exception as e:
-                pass
-
     def update(self, article_docs, entities, golds, drop=0.):
         doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop)
         entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=drop)
@@ -476,112 +245,6 @@ class EL_Model:
         bp_doc(doc_gradient)
         bp_encoding(entity_gradient)
 
-    def _calculate_probability_depr(self, vector1, vector2, allvectors):
-        """ Make sure that vector2 is included in allvectors """
-        if len(vector1) != len(vector2):
-            raise ValueError("To calculate similarity, both vectors should be of equal length")
-
-        vector1_t = vector1.transpose()
-        e = self._calculate_dot_exp(vector2, vector1_t)
-        e_sum = 0
-        for v in allvectors:
-            e_sum += self._calculate_dot_exp(v, vector1_t)
-
-        return float(e / (self.EPS + e_sum))
-
-    def _calculate_loss_depr(self, true_prob, all_probs):
-        """ all_probs should include true_prob ! """
-        return -1 * np.log((self.EPS + true_prob) / (self.EPS + sum(all_probs)))
-
-    @staticmethod
-    def _calculate_doc_gradient_depr(loss, doc_vector, true_vector, false_vectors):
-        gradient = np.zeros(len(doc_vector))
-        for i in range(len(doc_vector)):
-            min_false = min(x[i] for x in false_vectors)
-            max_false = max(x[i] for x in false_vectors)
-
-            if true_vector[i] > max_false:
-                if doc_vector[i] > 0:
-                    gradient[i] = 0
-                else:
-                    gradient[i] = -loss
-            elif true_vector[i] < min_false:
-                if doc_vector[i] > 0:
-                    gradient[i] = loss
-                if doc_vector[i] < 0:
-                    gradient[i] = 0
-            else:
-                # non-distinctive vector positions should converge to 0
-                gradient[i] = doc_vector[i]
-
-        return gradient
-
-    # TODO: delete ? try again ?
-    def depr__calculate_true_gradient(self, doc_vector, entity_vector):
-        # sum_entity_vector = sum(entity_vector)
-        # gradient = [-sum_entity_vector/(self.EPS + np.exp(doc_vector[i] * entity_vector[i])) for i in range(len(doc_vector))]
-        gradient = [1 / (self.EPS + np.exp(doc_vector[i] * entity_vector[i])) for i in range(len(doc_vector))]
-        return np.asarray(gradient)
-
-    def _calculate_losses_vector_depr(self, doc_vector, true_vector, false_vectors):
-        # prob_true = list()
-        # prob_false_dict = dict()
-
-        true_losses = list()
-        # false_losses_dict = dict()
-
-        for i in range(len(true_vector)):
-            doc_i = np.asarray([doc_vector[i]])
-            true_i = np.asarray([true_vector[i]])
-            falses_i = np.asarray([[fv[i]] for fv in false_vectors])
-            all_i = [true_i]
-            all_i.extend(falses_i)
-
-            prob_true_i = self._calculate_probability(doc_i, true_i, all_i)
-            # prob_true.append(prob_true_i)
-
-            # false_list = list()
-            all_probs_i = [prob_true_i]
-            for false_i in falses_i:
-                prob_false_i = self._calculate_probability(doc_i, false_i, all_i)
-                all_probs_i.append(prob_false_i)
-                # false_list.append(prob_false_i)
-            # prob_false_dict[i] = false_list
-
-            true_loss_i = self._calculate_loss(prob_true_i, all_probs_i).astype(np.float32)
-            if doc_vector[i] > 0:
-                true_loss_i = -1 * true_loss_i
-            true_losses.append(true_loss_i)
-
-            # false_loss_list = list()
-            # for prob_false_i in false_list:
-                # false_loss_i = self._calculate_loss(prob_false_i, all_probs_i).astype(np.float32)
-                # false_loss_list.append(false_loss_i)
-            # false_losses_dict[i] = false_loss_list
-
-        return true_losses  # , false_losses_dict
-
-    def _calculate_entity_gradient_depr(self, loss, doc_vector, true_vector, false_vectors):
-        true_losses = self._calculate_losses_vector(doc_vector, true_vector, false_vectors)
-
-        # renormalize the gradient so that the total sum of abs values does not exceed the actual loss
-        loss_i = sum([abs(x) for x in true_losses])  # sum of absolute values
-        entity_gradient = [(x/2) * (loss/loss_i) for x in true_losses]
-        doc_gradient = [(x/2) * (loss/loss_i) for x in true_losses]
-
-        return np.asarray(entity_gradient), np.asarray(doc_gradient)
-
-
-    @staticmethod
-    def _calculate_dot_exp_depr(vector1, vector2_transposed):
-        dot_product = vector1.dot(vector2_transposed)
-        dot_product = min(50, dot_product)
-        dot_product = max(-10000, dot_product)
-        # print("DOT", dot_product)
-        e = np.exp(dot_product)
-        # print("E", e)
-        return e
-
     def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print):
         id_to_descr = kb_creator._get_id_to_description(entity_descr_output)
 
@@ -589,7 +252,6 @@ class EL_Model:
                                                                                          collect_correct=True,
                                                                                          collect_incorrect=True)
 
-
         instance_by_doc = dict()
         local_vectors = list()   # TODO: local vectors
         doc_by_article = dict()