import re
from nltk.stem.snowball import SnowballStemmer


# Splits an identifier into word-like tokens: a bare apostrophe, acronym
# runs ("HTTP"), CamelCase words ("Typed"), digit runs, and lowercase
# runs -- each word optionally keeping a trailing ".", plus any single
# leading symbol character stuck to the first token.
SEGMENTOR = re.compile(r'\'|(?:^[^A-Za-z0-9\s\'])?(?:[^a-z\_\s\'\.]+$|[^a-z\_\s\'\.]+[0-9\.]|[^a-z\_\s\'\.]+(?![a-z])|[A-Z][^A-Z0-9\_\s\'\.]+\.?|[^A-Z0-9\_\s\'\.]+\.?)')
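
# Illustrative examples:
#   SEGMENTOR.findall("Typed.TypedDefinition'") == ["Typed.", "Typed", "Definition", "'"]
#   SEGMENTOR.findall("fooBarBaz2")             == ["foo", "Bar", "Baz", "2"]
#   SEGMENTOR.findall("HTTPResponse")           == ["HTTP", "Response"]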


def segment_simple(ident):
  """Split an identifier into its component words."""
  words = SEGMENTOR.findall(ident)
  if len(words) == 0:
    # No segments matched; fall back to the whole identifier as one word.
    return [ident]
  return words
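
# e.g. segment_simple("fooBarBaz2") == ["foo", "Bar", "Baz", "2"]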


STEM_SIZE = 4

def segment(ident):
  """Like segment_simple, but truncates each word to its first STEM_SIZE chars."""
  words = [w[:STEM_SIZE] for w in SEGMENTOR.findall(ident)]
  if len(words) == 0:
    return [ident[:STEM_SIZE]]
  return words
  # Unstemmed variant, kept for reference:
  # words = SEGMENTOR.findall(ident)
  # if len(words) == 0:
  #   return [ident]
  # return words
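
# e.g. segment("TypedDefinition") == ["Type", "Defi"]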


STEMMER = SnowballStemmer("english")


def split_into_prefix_suffix(word):
  """Split a word into [stem, suffix] using the Snowball stemmer.

  The suffix is "" when the stem covers the whole (lowercased) word.
  """
  word = word.lower()

  # A trailing "." comes from the segmentor, not the word itself;
  # stem the word without it.
  if word[-1] == ".":
    prefix = STEMMER.stem(word[0:-1])
  else:
    prefix = STEMMER.stem(word)

  # Walk to the first position where stem and word diverge; the rest
  # of the word (including any trailing ".") is the suffix.
  for i in range(len(word)):
    if i >= len(prefix) or prefix[i] != word[i]:
      return [prefix, word[i:]]

  return [prefix, ""]
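
# e.g. (with NLTK's English Snowball stemmer; exact stems may vary by version):
#   split_into_prefix_suffix("Definition") == ["definit", "ion"]
#   split_into_prefix_suffix("typed.")     == ["type", "d."]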


def split_into_prefix_and_perhaps_suffix(word):
  prefix, suffix = split_into_prefix_suffix(word)

  # Preserve capitalization of first letter to avoid
  # collision between type word "A" and type var "a"
  #
  # Other possible collisions are handled by preferring most common.
  prefix = word[0] + prefix[1:]

  if suffix == "":
    return [prefix]
  else:
    return [prefix, "-" + suffix] # Avoid suffix collision with type vars "d", "e", etc.
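
# e.g. split_into_prefix_and_perhaps_suffix("Typed") == ["Type", "-d"]
#      split_into_prefix_and_perhaps_suffix("A")     == ["A"]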


# Also splits words into prefix/suffix, and downcases everything beyond
# each word's first letter.
#
# "Typed.TypedDefinition'" => ["Type", "-d.", "Type", "-d", "Definit", "-ion", "'"]
def segment2(ident):
  words = [part
           for word in SEGMENTOR.findall(ident)
           for part in split_into_prefix_and_perhaps_suffix(word)]
  if len(words) == 0:
    # No segments matched; fall back to the whole identifier as one word.
    return [ident]
  return words

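
if __name__ == "__main__":
  # Smoke test on the docstring example above (assumes NLTK's English
  # Snowball stemmer data is installed; exact stems can vary by version).
  example = "Typed.TypedDefinition'"
  print(segment_simple(example))  # ['Typed.', 'Typed', 'Definition', "'"]
  print(segment(example))         # ['Type', 'Type', 'Defi', "'"]
  print(segment2(example))        # ['Type', '-d.', 'Type', '-d', 'Definit', '-ion', "'"]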