Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Re-order a tab-separated file of sentence pairs so that the pairs whose
# first-column sentence uses the most frequent vocabulary come first.
#
# Usage: python3 sort.py target-lang pairs.csv
#
# Column 0 is scored with wordfreq in the language given on the command line;
# column 1 is carried along untouched. The sorted pairs are written to stdout
# as UTF-8, one "col0<TAB>col1" line per pair.
if len(sys.argv) != 3:
    print('Usage: python3 sort.py target-lang pairs.csv')
    sys.exit(1)

targetLang = sys.argv[1]
pairsPath = sys.argv[2]

# Maps column 0 -> (lowest word frequency, mean word frequency, column 1).
# Keyed by column 0, so when a sentence repeats only its last row survives.
pairs = {}
# newline='' lets the csv module do its own line-ending handling, as the csv
# documentation asks for any file object handed to csv.reader.
with open(pairsPath, 'r', encoding='utf-8', newline='') as pairsFile:
    reader = csv.reader(pairsFile, delimiter='\t')
    for row in reader:
        # Blank or single-column lines cannot form a pair; skip them rather
        # than dying with an IndexError on row[1].
        if len(row) < 2:
            continue
        # HTML entities are decoded for scoring only; the stored key and the
        # emitted text remain the raw column value.
        words = wordfreq.tokenize(html.unescape(row[0]), targetLang)
        freqs = [wordfreq.zipf_frequency(word, targetLang, wordlist='combined')
                 for word in words]
        if freqs:
            minfreq = min(freqs)
            avgfreq = sum(freqs) / len(freqs)
        else:
            # Nothing tokenizable (e.g. punctuation only): min() and the
            # division would both raise. Score it 0.0, the default floor
            # zipf_frequency reports for unknown words, so it sorts last.
            minfreq = avgfreq = 0.0
        pairs[row[0]] = (minfreq, avgfreq, row[1])

# Descending by the (min, mean, column 1) tuple: the rarest word in a
# sentence dominates, the mean frequency breaks ties.
pairList = list(pairs.items())
pairList.sort(reverse=True, key=lambda i: i[1])
for pair in pairList:
    # Write bytes directly so the output is UTF-8 regardless of the
    # console's configured encoding.
    sys.stdout.buffer.write((pair[0] + '\t' + pair[1][2] + '\n').encode('utf-8'))
async def first_word(request):
    """Choose a random starting word and return it wrapped by ``json``.

    Words are drawn uniformly from ``index`` and re-drawn until one is both
    a member of ``first_words`` and has an English Zipf frequency of at
    least 3.5 in wordfreq's "large" wordlist.

    Args:
        request: The incoming request; not inspected here.

    Returns:
        Whatever ``json({"word": w})`` produces for the chosen word
        (presumably the web framework's JSON response helper -- confirm
        against the file's imports).

    NOTE(review): this loops forever if no entry of ``index`` satisfies both
    conditions; confirm the data guarantees at least one such word.
    """
    # Build the candidate list once instead of on every retry; ``index`` is
    # not modified inside this function and there is no await in the loop.
    candidates = list(index)
    w = random.choice(candidates)
    while w not in first_words or wordfreq.zipf_frequency(w, 'en', wordlist="large") < 3.5:
        w = random.choice(candidates)
    return json({"word": w})
def canUse(candidate, past):
    """
    Screen ``candidate`` against frequency, spelling, blacklist and history.

    Returns False as soon as one of the checks below rejects the word:
    its English Zipf frequency ("large" wordlist) is under 2.3 or over 6,
    it contains non-alphabetic characters, it is listed in ``bad_words``,
    or ``lexicallyRelated`` reports it as related to any word in ``past``.
    """
    freq = wordfreq.zipf_frequency(candidate, "en", wordlist="large")
    stemFreq = wordfreq.zipf_frequency(ps.stem(candidate), "en", wordlist="large")
    # Best of the word's own frequency and its stem's. Not consulted by any
    # check visible in this excerpt.
    rootFreq = max(freq, stemFreq)

    # Too obscure, or so common it is a function word such as "a" or "the".
    tooRare = freq < 2.3
    tooCommon = freq > 6
    if tooRare or tooCommon:
        return False

    # Non-letters (mostly '#'-containing entries) and blacklisted words.
    if not candidate.isalpha() or candidate in bad_words:
        return False

    # Reject anything related to a word that has already been used.
    for earlier in past:
        if lexicallyRelated(candidate, earlier):
            return False

    # NOTE(review): no path shown here returns True; control falls off the
    # end and yields None. The tail of this function (presumably further
    # checks and the accepting return) appears to be missing -- confirm.
def get_freq_scores(self, group_similar_spans=True):
    """Score every distinct phrase by the English Zipf frequency of its text.

    Walks ``self.doc_text_spans`` in lockstep with ``self.documents`` (so
    iteration stops at the shorter of the two) and records
    ``zipf_frequency(phrase.text, 'en')`` the first time each phrase is seen.

    Args:
        group_similar_spans: Passed through unchanged to
            ``self._maybe_group_and_sort``.

    Returns:
        The result of ``self._maybe_group_and_sort`` applied to the
        phrase -> score mapping.
    """
    scores_by_phrase = {}
    for _doc, spans in zip(self.documents, self.doc_text_spans):
        for span in spans:
            # A phrase already scored keeps its first score; skip the lookup.
            if span in scores_by_phrase:
                continue
            scores_by_phrase[span] = zipf_frequency(span.text, 'en')
    return self._maybe_group_and_sort(group_similar_spans, scores_by_phrase)
def canUse(candidate, past):
"""
Check whether a candidate is OK to use.
"""
candidateFrequency = wordfreq.zipf_frequency(candidate, "en", wordlist="large")
candidateRootFrequency = max(
candidateFrequency,
wordfreq.zipf_frequency(ps.stem(candidate), "en", wordlist="large"))
# Reject words that are too infrequent or too frequent (like "a" or "the")
if candidateFrequency < 2.3 or candidateFrequency > 6:
return False
# Mostly, this rejects '#'-containing words
if not candidate.isalpha():
return False
# Is it a bad word?
if candidate in bad_words:
return False