# Copyright (c) 2008 Yahoo! Inc. All rights reserved.
# Licensed under the Yahoo! Search BOSS Terms of Use
# (http://info.yahoo.com/legal/us/yahoo/search/bosstos/bosstos-2317.html)

import math
import porter

""" A simple text library for normalizing, cleaning, and overlapping strings """

__author__ = "Vik Singh (viksi@yahoo-inc.com)"

# Tokens that filter_stops() removes; every entry is lowercase.
STOPWORDS = set("""
    a about an are as at be by com de en for from how i in is it it's la of on
    or that the this to und was what when where who will with www your you're
""".split())

# Module-wide stemmer instance; tokenize() calls P.stem() on each token.
P = porter.PorterStemmer()

def strip_enclosed_carrots(s):
  """Return s with every "<...>" span removed.

  A span starts at a "<" and ends at the first ">" found after it. As soon as
  a "<" has no ">" after it, the rest of the string is kept unchanged.

  This is a single left-to-right scan rather than one recursive call per
  removed span, so input with a large number of tags does not exhaust the
  interpreter's recursion limit and the string is not re-copied per tag.
  """
  kept = []
  pos = 0
  while True:
    start = s.find("<", pos)
    if start < 0:
      break
    end = s.find(">", start)
    if end < 0:
      # Unterminated "<": everything from pos onward is kept verbatim.
      break
    kept.append(s[pos:start])
    pos = end + 1
  kept.append(s[pos:])
  return "".join(kept)

def filter_stops(words):
  """Remove the entries of words that appear in STOPWORDS.

  Membership is tested on each entry exactly as given; no case folding or
  trimming happens here. The result is whatever the built-in filter()
  produces for words.
  """
  def is_content_word(word):
    return word not in STOPWORDS

  return filter(is_content_word, words)

def uniques(s):
  """Return the set of distinct tokens that tokenize() produces for s."""
  tokens = tokenize(s)
  return set(tokens)

def tokenize(s):
  """Split s into normalized, stemmed tokens.

  For each whitespace-separated piece of s, in order:
    1. lowercase it and trim the characters ' " ` , . ; - ! from both ends;
    2. drop it if filter_stops() rejects it as a stopword;
    3. stem it with the module-level stemmer P;
    4. drop it if the stem is empty or only whitespace.

  Always returns a list, so callers can take its length or iterate it more
  than once even on interpreters where map()/filter() are lazy.
  """
  strip_chars = "\'\"`,.;-!"
  cleaned = [piece.lower().strip(strip_chars) for piece in s.split()]
  tokens = []
  for word in filter_stops(cleaned):
    stem = P.stem(word)
    if stem.strip():
      tokens.append(stem)
  return tokens

def norm(s):
  """Return the tokens of s sorted and concatenated with no separator."""
  ordered = list(tokenize(s))
  ordered.sort()
  return "".join(ordered)

def overlap(s1, s2):
  """Count the distinct tokens that s1 and s2 have in common."""
  shared = uniques(s1).intersection(uniques(s2))
  return len(shared)

def wc(tokens):
  """Tally how many times each token occurs.

  tokens: any iterable of hashable items.
  Returns a plain dict mapping each distinct token to its occurrence count.
  """
  counts = {}
  for token in tokens:
    counts[token] = counts.get(token, 0) + 1
  return counts

def euclid(wc):
  """Scale a token-count mapping to unit Euclidean (L2) length.

  wc: dict mapping token -> numeric count.
  Returns a new dict with the same keys, each count divided by the square
  root of the sum of the squared counts. An empty mapping gives an empty
  dict; if every count is zero the weights come back as 0.0 rather than
  raising ZeroDivisionError.
  """
  length = math.sqrt(sum(c ** 2 for c in wc.values()))
  if length == 0:
    return dict((w, 0.0) for w in wc)
  # items() instead of the Python-2-only iteritems(), so this runs on 2 and 3.
  return dict((w, c / length) for w, c in wc.items())

def cosine(v1, v2):
  """Return the dot product of two sparse vectors stored as dicts.

  v1, v2: dicts mapping key -> numeric weight. Only keys present in both
  contribute. The result is a true cosine similarity only when both inputs
  already have unit length (as the output of euclid() does); no
  normalization is performed here. Returns 0 when no key is shared.
  """
  # items() instead of the Python-2-only iteritems(), so this runs on 2 and 3.
  return sum(weight * v2[key] for key, weight in v1.items() if key in v2)

def vectorize(s):
  """Build the vector for s: tokenize it, count tokens with wc(), scale with euclid()."""
  counts = wc(tokenize(s))
  return euclid(counts)

def sim(s1, s2):
  """Score how similar two strings are.

  Each string is passed through vectorize() and the two resulting vectors
  are handed to cosine(); that value is returned.
  """
  return cosine(vectorize(s1), vectorize(s2))
