import os

from preshed.counter import PreshCounter


def test_large_freqs():
    # Only runs when TEST_FILE_LOC points at a frequencies file.
    if 'TEST_FILE_LOC' in os.environ:
        loc = os.environ['TEST_FILE_LOC']
    else:
        return None
    counts = PreshCounter()
    with open(loc) as file_:
        for i, line in enumerate(file_):
            line = line.strip()
            if not line:
                continue
            # The first whitespace-separated field on each line is the frequency.
            freq = int(line.split()[0])
            counts.inc(i + 1, freq)
    oov = i + 2
    assert counts.prob(oov) == 0.0
    assert counts.prob(1) < 0.1
    counts.smooth()
    # Smoothing gives unseen keys a small non-zero probability.
    assert counts.prob(oov) > 0
    assert counts.prob(oov) < counts.prob(i)
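
test_large_freqs only reads an integer frequency from the first whitespace-separated field of each non-empty line in the file named by TEST_FILE_LOC. A minimal sketch of generating a compatible file, assuming a Zipf-like count distribution (the file name, helper name, and constants are made up for illustration):

import os

def write_zipf_freqs(path='zipf_freqs.txt', n=100000):
    # One line per rank: "<freq>"; the test only reads this first field.
    with open(path, 'w') as file_:
        for rank in range(1, n + 1):
            # Crude Zipf law: frequency roughly inversely proportional to rank,
            # floored at 1 so the tail provides plenty of singleton counts.
            file_.write('%d\n' % max(1, 100000 // rank))

write_zipf_freqs()
os.environ['TEST_FILE_LOC'] = 'zipf_freqs.txt'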
def test_count():
    counter = PreshCounter()
    assert counter[12] == 0
    counter.inc(12, 1)
    assert counter[12] == 1
    counter.inc(14, 10)
    counter.inc(9, 10)
    counter.inc(12, 4)
    assert counter[12] == 5
    assert counter[14] == 10
    assert counter[9] == 10
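
Together, the two tests above cover the core PreshCounter lifecycle: inc() to add counts for integer keys, indexing to read raw counts back, and smooth() so that prob() assigns a small non-zero probability to unseen keys. A compact sketch of that lifecycle on made-up Zipf-like data:

from preshed.counter import PreshCounter

counts = PreshCounter()
# Made-up data: low keys are frequent, with a long tail of singleton counts.
for key in range(1, 1001):
    counts.inc(key, max(1, 1000 // key))

assert counts[1] == 1000          # raw count via __getitem__
assert counts.prob(5000) == 0.0   # unseen key: zero probability before smoothing
counts.smooth()
assert counts.prob(5000) > 0.0    # unseen key: small non-zero probability after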
import gzip
import math
from ast import literal_eval
from preshed.counter import PreshCounter


def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
    if not loc.exists():
        print("Warning: Frequencies file not found")
        return {}, 0.0
    counts = PreshCounter()
    total = 0
    # First pass: accumulate counts and the grand total.
    if str(loc).endswith('gz'):
        file_ = gzip.open(str(loc), 'rt')  # text mode, so lines are str not bytes
    else:
        file_ = loc.open()
    for i, line in enumerate(file_):
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        freq = int(freq)
        counts.inc(i + 1, freq)
        total += freq
    counts.smooth()
    log_total = math.log(total)
    # Second pass: re-open the file and turn qualifying rows into smoothed
    # log-probabilities. (The source snippet was truncated here; what follows
    # is a reconstruction consistent with the first pass.)
    if str(loc).endswith('gz'):
        file_ = gzip.open(str(loc), 'rt')
    else:
        file_ = loc.open()
    probs = {}
    for line in file_:
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        doc_freq = int(doc_freq)
        freq = int(freq)
        if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
            word = literal_eval(key)  # keys assumed to be Python string literals
            smooth_count = counts.smoother(int(freq))
            probs[word] = math.log(smooth_count) - log_total
    oov_prob = min(probs.values()) - 1
    return probs, oov_prob
import io

from preshed.counter import PreshCounter
from spacy.strings import StringStore


def merge_counts(locs, out_loc):
    string_map = StringStore()
    counts = PreshCounter()
    for loc in locs:
        with io.open(loc, 'r', encoding='utf8') as file_:
            for line in file_:
                freq, word = line.strip().split('\t', 1)
                # Map the word to a stable integer ID, then count by ID.
                orth = string_map[word]
                counts.inc(orth, int(freq))
    with io.open(out_loc, 'w', encoding='utf8') as file_:
        for orth, count in counts:
            string = string_map[orth]
            file_.write('%d\t%s\n' % (count, string))
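
merge_counts treats each input as a UTF-8 file of tab-separated freq/word pairs and sums the frequencies per word across all inputs. A small usage sketch with hypothetical file names:

# Hypothetical inputs: one "freq<TAB>word" pair per line.
with open('counts_a.txt', 'w', encoding='utf8') as f:
    f.write('3\tapple\n2\tpear\n')
with open('counts_b.txt', 'w', encoding='utf8') as f:
    f.write('4\tapple\n')

merge_counts(['counts_a.txt', 'counts_b.txt'], 'merged.txt')
# merged.txt should now hold "7\tapple" and "2\tpear" (row order may vary).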
import math
from ast import literal_eval

from preshed.counter import PreshCounter
from tqdm import tqdm


def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
    print("Counting frequencies...")
    counts = PreshCounter()
    total = 0
    # First pass: accumulate counts and the grand total.
    with freqs_loc.open() as f:
        for i, line in enumerate(f):
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    # Second pass: keep rows that clear the doc-frequency, frequency, and
    # length thresholds. (The source snippet breaks off inside this if-block;
    # the body is a reconstruction consistent with the surrounding code.)
    with freqs_loc.open() as f:
        for line in tqdm(f):
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
                word = literal_eval(key)
                smooth_count = counts.smoother(int(freq))
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = min(probs.values()) - 1
    return probs, oov_prob
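
read_freqs takes a pathlib.Path to a tab-separated file with freq, doc_freq, and key columns, where (under the reconstruction above) each key is a Python string literal. A hedged call sketch with a hypothetical path:

from pathlib import Path

# Hypothetical file: freq<TAB>doc_freq<TAB>key per line, with key written
# as a Python string literal such as 'the'.
probs, oov_prob = read_freqs(Path('freqs.tsv'))
print('%d entries; OOV log-prob: %.2f' % (len(probs), oov_prob))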
import codecs

from preshed.counter import PreshCounter
# Imports below assume the old spaCy 0.x API this snippet was written against.
from spacy.en import English
from spacy.strings import hash_string


def make_matcher(patterns_path, text_path, counts_loc, n=10000000):
    nlp = English(parser=False, tagger=False, entity=False)
    print("Make matcher")
    # read_gazetteer and get_matches are helpers defined elsewhere in the
    # original script; they yield pattern docs and matched spans respectively.
    phrases = read_gazetteer(nlp.tokenizer, patterns_path, n=n)
    counts = PreshCounter()
    for mwe in get_matches(nlp.tokenizer, phrases, open(text_path)):
        # Count each matched multi-word expression by its string hash.
        counts.inc(hash_string(mwe.text), 1)
    with codecs.open(counts_loc, 'w', 'utf8') as file_:
        for phrase in read_gazetteer(nlp.tokenizer, patterns_path, n=n):
            text = phrase.string
            key = hash_string(text)
            count = counts[key]
            if count != 0:
                file_.write('%d\t%s\n' % (count, text))
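
A sketch of invoking make_matcher; the paths are hypothetical, and read_gazetteer/get_matches must be supplied by the original script:

# Hypothetical paths: a gazetteer of phrase patterns, a plain-text corpus,
# and an output file for the tab-separated phrase counts.
make_matcher('gazetteer.txt', 'corpus.txt', 'phrase_counts.tsv', n=100000)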
# Constructor snippet from a larger class: a counter plus a hash-to-string
# map, with a frequency threshold.
def __init__(self, directory, min_freq=10):
    self.directory = directory
    self.counts = PreshCounter()
    self.strings = {}
    self.min_freq = min_freq
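
The fields above suggest a frequency-thresholded string counter: count by hash, remember each hash's surface string, and keep only entries at or above min_freq. A hedged reconstruction of how such a class might look around this __init__ (the class and method names are illustrative, not from the original):

from preshed.counter import PreshCounter
from spacy.strings import hash_string


class FreqStrings:
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count(self, string):
        # Count by hash, remembering the surface string for later lookup.
        key = hash_string(string)
        self.counts.inc(key, 1)
        self.strings[key] = string

    def frequent_strings(self):
        # Yield (string, count) for entries that clear the threshold.
        for key, count in self.counts:
            if count >= self.min_freq:
                yield self.strings[key], count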