Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
"""
Load word frequencies from a "Web as Corpus" file, collected and
provided by the University of Leeds.
For more information, see: http://corpus.leeds.ac.uk/list.html
"""
infile = codecs.open(filename, encoding='utf-8')
counts = defaultdict(float)
for line in infile:
line = line.rstrip()
if line:
rank = line.split(' ')[0]
if NUMBER_RE.match(rank) and line.count(' ') == 2:
_, freq, token = line.split(' ')
token = standardize_word(ftfy(token))
freq = float(freq)
counts[token] += freq
return _scale_freqs(counts)
def leeds_corpus_frequencies(corpusfile, stemmer):
    """
    Tally integer word frequencies from a Leeds-style corpus frequency file.

    A line is used only when it consists of exactly three space-separated
    fields (rank, frequency, token) and its rank field is accepted by
    NUMBER_RE; every other line is ignored. Each frequency is multiplied
    by 100 and truncated to an integer before being accumulated.

    Two tallies are kept: one keyed by every space-separated word of the
    stemmed token, and one keyed by the lowercased unstemmed token. Words
    and tokens containing a comma are left out of their respective tally.
    Finally, wherever the unstemmed tally for a key exceeds the stemmed
    tally, the larger value replaces it.

    Parameters:
        corpusfile: path of the UTF-8 encoded frequency file.
        stemmer: callable mapping a token to its stemmed string, or None
            to use tokens unchanged.

    Returns:
        A defaultdict(int) mapping words to their accumulated counts.
    """
    if stemmer is None:
        # Identity stemmer: tokens are counted as they appear.
        stemmer = lambda x: x

    freqs = defaultdict(int)
    tokenfreqs = defaultdict(int)

    # The context manager closes the file even if a line fails to parse.
    with codecs.open(corpusfile, encoding='utf-8') as infile:
        for line in infile:
            fields = ftfy(line.strip()).split(' ')
            # Blank lines and lines without the three-field shape are skipped.
            if len(fields) != 3 or not NUMBER_RE.match(fields[0]):
                continue

            _, freq, token = fields
            stemmed = stemmer(token)
            # Parenthesised so the same text is printed whether `print` is
            # the Python 2 statement or the Python 3 function.
            print("%s -> %s" % (token, stemmed))

            # Keep two decimal places of the frequency as an integer count.
            freq_int = int(float(freq) * 100)

            for word in stemmed.split(' '):
                if ',' not in word:
                    freqs[word] += freq_int
            if ',' not in token:
                tokenfreqs[token.lower()] += freq_int

    # Stemming must never make a surface form look rarer than it is.
    for key, token_count in tokenfreqs.items():
        freqs[key] = max(freqs[key], token_count)

    return freqs
# Run ftfy over string cells only; non-string values pass through unchanged.
# isinstance() also accepts str subclasses, which an exact type() comparison
# would silently skip.
data['content'] = data['content'].apply(
    lambda x: ftfy.ftfy(x) if isinstance(x, str) else x
)