Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# en_US_GB_CA is a superset of US, GB and CA
# spellings (color, colour, etc). It contains
# roughly half a million words. For this
# example, imagine it's just seven words...
#
# we (lower)
# flew (lower)
# to (lower)
# Abu (mixed)
# Dhabi (mixed)
# via (lower)
# Colombo (mixed)

# The all-lowercase words. In the seven-word example:
# {'we', 'flew', 'to', 'via'}
LOWERCASE = words_from_archive('en_US_GB_CA_lower.txt')

# The mixed-case words, loaded with map_case=True. In the example this
# is a dict from the lowercased word to its original spelling:
# {'abu': 'Abu',
# 'dhabi': 'Dhabi',
# 'colombo': 'Colombo'}
#
# Note that en_US_GB_CA_mixed.txt also contains
# acronyms/mixed case variants of common words,
# so in reality, CASE_MAPPED also contains:
#
# {'to': 'TO',
# 'via': 'Via'}
CASE_MAPPED = words_from_archive('en_US_GB_CA_mixed.txt',
map_case=True)

# The original-case spellings (the values of CASE_MAPPED):
# {'Abu', 'Dhabi', 'Colombo'}
MIXED_CASE = set(CASE_MAPPED.values())

# The lowercased forms (the keys of CASE_MAPPED):
# {'abu', 'dhabi', 'colombo'}
LOWERED = set(CASE_MAPPED.keys())
def parse(lang_sample):
    """Tally word popularity using novel extracts, etc.

    Loads the words of ``lang_sample`` with duplicates kept
    (``include_dups=True``) and counts how many times each one occurs.

    Args:
        lang_sample: Passed straight to ``words_from_archive``;
            presumably the name of a text sample inside the word
            archive -- confirm against that helper.

    Returns:
        A ``(distinct_words, tally)`` tuple: the set of words seen in
        the sample, and the mapping from each word to its number of
        occurrences.
    """
    sample_words = words_from_archive(lang_sample, include_dups=True)
    # zero_default_dict() must hand back a mapping whose missing keys
    # start at a value that supports ``+= 1`` (presumably 0).
    tally = zero_default_dict()
    for token in sample_words:
        tally[token] += 1
    distinct_words = set(sample_words)
    return distinct_words, tally
from autocorrect.utils import words_from_archive
# en_US_GB_CA is a superset of US, GB and CA
# spellings (color, colour, etc). It contains
# roughly half a million words. For this
# example, imagine it's just seven words...
#
# we (lower)
# flew (lower)
# to (lower)
# Abu (mixed)
# Dhabi (mixed)
# via (lower)
# Colombo (mixed)

# The all-lowercase words. In the seven-word example:
# {'we', 'flew', 'to', 'via'}
LOWERCASE = words_from_archive('en_US_GB_CA_lower.txt')

# The mixed-case words, loaded with map_case=True. In the example this
# is a dict from the lowercased word to its original spelling:
# {'abu': 'Abu',
# 'dhabi': 'Dhabi',
# 'colombo': 'Colombo'}
#
# Note that en_US_GB_CA_mixed.txt also contains
# acronyms/mixed case variants of common words,
# so in reality, CASE_MAPPED also contains:
#
# {'to': 'TO',
# 'via': 'Via'}
CASE_MAPPED = words_from_archive('en_US_GB_CA_mixed.txt',
map_case=True)

# The original-case spellings (the values of CASE_MAPPED).
MIXED_CASE = set(CASE_MAPPED.values())