spaCy/spacy/ptb3.pyx

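'''Penn Treebank 3 (PTB3) compatible English tokenization.

Defines the orthographic and tag flag indices, a regex tokenizer adapted from
NLTK, and the PennTreebank3 Language subclass that is built on them.

NOTE(review): the summary below describes lexicon behaviour that is not
implemented in this file; presumably it refers to the Language base class --
confirm or remove.

Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes.  Use 64-bit hash values and
boldly assume no collisions.
'''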
from __future__ import unicode_literals


from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t


cimport spacy

import re

from spacy import orth

# Thresholds handed to the orth.can_tag / orth.oft_case factories when the
# PennTreebank3 flag table is built.
# NOTE(review): presumably these are minimum proportions of occurrences --
# confirm against spacy.orth.
TAG_THRESH = 0.5
UPPER_THRESH = 0.2
LOWER_THRESH = 0.5
TITLE_THRESH = 0.7

# Total number of flag slots. The unpacking below raises ValueError at import
# time if this count and the list of names ever drift apart.
NR_FLAGS = 24

# Each name is the index of its flag in the flag-function table; the values
# are consecutive integers starting at 0, in the order listed.
(
    # "Often cased" flags.
    OFT_UPPER,
    OFT_LOWER,
    OFT_TITLE,

    # Orthographic flags.
    IS_ALPHA,
    IS_DIGIT,
    IS_PUNCT,
    IS_SPACE,
    IS_ASCII,
    IS_TITLE,
    IS_LOWER,
    IS_UPPER,

    # "Can have tag" flags.
    CAN_PUNCT,
    CAN_CONJ,
    CAN_NUM,
    CAN_DET,
    CAN_ADP,
    CAN_ADJ,
    CAN_ADV,
    CAN_VERB,
    CAN_NOUN,
    CAN_PDT,
    CAN_POS,
    CAN_PRON,
    CAN_PRT,
) = range(NR_FLAGS)


# List of contractions adapted from Robert MacIntyre's tokenizer.
# The number in each name is the length of the shortest surface form the
# group targets is not implied; it simply mirrors the original naming.
# Every pattern is case-insensitive and captures the pieces a contraction is
# split into as separate groups.

# Two-piece contractions. Note the last pattern ends in a literal space
# rather than a word boundary.
CONTRACTIONS2 = [
    re.compile(pattern)
    for pattern in (
        r"(?i)\b(can)(not)\b",
        r"(?i)\b(d)('ye)\b",
        r"(?i)\b(gim)(me)\b",
        r"(?i)\b(gon)(na)\b",
        r"(?i)\b(got)(ta)\b",
        r"(?i)\b(lem)(me)\b",
        r"(?i)\b(mor)('n)\b",
        r"(?i)\b(wan)(na) ",
    )
]

# Two-piece contractions that begin with an apostrophe; each must be
# preceded by a space.
CONTRACTIONS3 = [
    re.compile(pattern)
    for pattern in (
        r"(?i) ('t)(is)\b",
        r"(?i) ('t)(was)\b",
    )
]

# Three-piece contractions.
CONTRACTIONS4 = [
    re.compile(pattern)
    for pattern in (
        r"(?i)\b(whad)(dd)(ya)\b",
        r"(?i)\b(wha)(t)(cha)\b",
    )
]

def nltk_regex_tokenize(text):
    """Tokenize ``text`` with a fixed sequence of regex rewrites.

    Implementation taken from NLTK 3.0, based on tokenizer.sed. Each rule
    inserts spaces around the material it recognises; the rewritten text is
    finally split on whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        list: The whitespace-separated tokens of the rewritten text.
    """
    # Rules are order-sensitive: later patterns rely on the spacing that
    # earlier ones introduce, so the sequence must not be rearranged.
    rules_before_padding = (
        # Starting quotes.
        (r'^\"', r'``'),
        (r'(``)', r' \1 '),
        (r'([ (\[{<])"', r'\1 `` '),
        # Punctuation.
        (r'([:,])([^\d])', r' \1 \2'),
        (r'\.\.\.', r' ... '),
        (r'[;@#$%&]', r' \g<0> '),
        (r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 '),
        (r'[?!]', r' \g<0> '),
        (r"([^'])' ", r"\1 ' "),
        # Parens, brackets, etc.
        (r'[\]\[\(\)\{\}\<\>]', r' \g<0> '),
        (r'--', r' -- '),
    )
    rules_after_padding = (
        # Ending quotes.
        (r'"', " '' "),
        (r'(\S)(\'\')', r'\1 \2 '),
        # Clitics such as 's, 'm, 'd, 'll, 're, 've and n't.
        (r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 "),
        (r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 "),
    )

    for pattern, replacement in rules_before_padding:
        text = re.sub(pattern, replacement, text)

    # Pad both ends so the remaining rules, which anchor on a neighbouring
    # space, also apply at the very start and end of the text.
    text = " " + text + " "

    for pattern, replacement in rules_after_padding:
        text = re.sub(pattern, replacement, text)

    # CONTRACTIONS4 is deliberately not applied: the corresponding rules are
    # commented out in the SED scripts as well.
    for contraction_group in (CONTRACTIONS2, CONTRACTIONS3):
        for contraction in contraction_group:
            text = contraction.sub(r' \1 \2 ', text)

    return text.split()


cdef class PennTreebank3(Language):
    """Fully PTB compatible English tokenizer, tightly coupled to lexicon.

    Attributes:
        name (unicode): The two letter code used by Wikipedia for the language.
        lexicon (Lexicon): The lexicon. Exposes the lookup method.
    """


    def __cinit__(self, name):
        # Build a table with one slot per flag, indexed by the module-level
        # flag constants, and hand it to Language.__init__ together with name.
        # Slots start out as the placeholder 0.
        flag_funcs = [0 for _ in range(NR_FLAGS)]

        flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
        flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
        flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)

        flag_funcs[IS_ALPHA] = orth.is_alpha
        flag_funcs[IS_DIGIT] = orth.is_digit
        flag_funcs[IS_PUNCT] = orth.is_punct
        flag_funcs[IS_SPACE] = orth.is_space
        flag_funcs[IS_TITLE] = orth.is_title
        flag_funcs[IS_LOWER] = orth.is_lower
        flag_funcs[IS_UPPER] = orth.is_upper

        flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
        flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
        flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
        flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
        flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
        flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
        flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
        flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
        flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
        flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
        flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)

        # NOTE(review): IS_ASCII, CAN_ADV and CAN_PRON are never assigned, so
        # their slots keep the placeholder 0 -- confirm whether that is
        # intentional or whether matching orth functions should be registered.

        Language.__init__(self, name, flag_funcs)


    cdef list _split(self, unicode chunk):
        # Tokenize chunk with nltk_regex_tokenize, then glue a trailing '.'
        # token back onto the token before it.
        # NOTE(review): presumably the final period is re-attached because it
        # is handled elsewhere -- confirm against the Language base class.
        strings = nltk_regex_tokenize(chunk)
        # Check the non-empty contract before indexing, so an empty or
        # whitespace-only chunk fails with a clear AssertionError instead of
        # an IndexError on strings[-1].
        assert strings
        # Only merge when there is a preceding token to merge into: a chunk
        # that is just "." tokenizes to ['.'], and popping it would leave
        # nothing for the '+=' below to extend.
        if len(strings) >= 2 and strings[-1] == '.':
            strings.pop()
            strings[-1] += '.'
        return strings


# Module-level PennTreebank3 instance, constructed at import time with the
# name 'ptb3'.
PTB3 = PennTreebank3('ptb3')