
import text

# Characters that urlnorm() replaces with a single space.
NONCHARS = ["`", "~", "!", "@", "#", "$", "%", "^", "&", "*", "(", ")", "_", "-", "+", "=", "{", "}", "[", "]", "\\", "|", ":", ";", "\"", "'", "<", ",", ">", ".", "/", "?"]

# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably common URL tokens meant to be ignored -- confirm where it is used.
STOPS = ["www", "com", "http", "php", "htm", "html", "asp", "aspx", "index", "org", "net"]

# Minimum text.cosine() score at which isdup() treats two feature sets as
# duplicates (the comparison is inclusive: score >= THRESHOLD).
THRESHOLD = 0.67

def urlnorm(u):
  """Return `u` with every character listed in NONCHARS replaced by a space.

  Characters that are not in NONCHARS pass through unchanged.
  """
  return "".join(" " if ch in NONCHARS else ch for ch in u)

def isdup(rf1, rf2):
  """Tell whether two feature sets count as duplicates of each other.

  Returns True when text.cosine(rf1, rf2) is greater than or equal to
  THRESHOLD, and False otherwise.
  """
  score = text.cosine(rf1, rf2)
  return score >= THRESHOLD

class Dedup:
  """Accumulates records, dropping those too similar to one already kept.

  Attributes:
    results: records accepted so far, in the order they were checked.
    pres: keys (or indices) used to look up the fields of each record that
      take part in the comparison.
  """

  def __init__(self, pres):
    """Create an empty de-duplicator.

    Args:
      pres: iterable of keys/indices; for every record `r` passed to
        check(), `r[p]` is read for each `p` in `pres`.
    """
    self.results = []
    # Feature vectors of the accepted records, parallel to self.results.
    self._featuresets = []
    self.pres = pres

  def check(self, r):
    """Add `r` to `results` unless it duplicates an already accepted record.

    The fields selected by `pres` are normalized with urlnorm(), joined with
    spaces and vectorized via text.vectorize(); the record is rejected as
    soon as isdup() matches it against any stored feature set.

    Args:
      r: record supporting `r[p]` lookup for every `p` in `pres`.

    Returns:
      None in both cases; the outcome is visible only through `results`.
    """
    s = " ".join(urlnorm(r[p]) for p in self.pres)
    rf = text.vectorize(s)
    # any() stops at the first match, same as the early return it replaces.
    if any(isdup(rf, fs) for fs in self._featuresets):
      return
    self._featuresets.append(rf)
    self.results.append(r)
