How to use preshed - 10 common examples

To help you get started, we’ve selected a few preshed examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github explosion / preshed / tests / test_counter.py View on Github external
def test_large_freqs():
    """Check PreshCounter probabilities before and after smoothing.

    The input file path is taken from the ``TEST_FILE_LOC`` environment
    variable; when that variable is not set the test returns ``None``
    without checking anything. Every non-blank line of the file must start
    with an integer frequency (the first whitespace-separated field).
    """
    loc = os.environ.get('TEST_FILE_LOC')
    if loc is None:
        return None
    counts = PreshCounter()
    # Use a context manager so the handle is closed even when a line fails
    # to parse; the original left the file open.
    with open(loc) as file_:
        for i, line in enumerate(file_):
            line = line.strip()
            if not line:
                continue
            freq = int(line.split()[0])
            # Keys are the 1-based line numbers (blank lines still advance i).
            counts.inc(i + 1, freq)
    # The largest key that can have been inserted is i + 1, so i + 2 is a
    # key that was never counted.
    oov = i + 2
    assert counts.prob(oov) == 0.0
    assert counts.prob(1) < 0.1
    counts.smooth()
    # After smoothing the unseen key must get some probability mass, but
    # less than a key from the counted range.
    assert counts.prob(oov) > 0
    assert counts.prob(oov) < counts.prob(i)
github explosion / preshed / tests / test_counter.py View on Github external
def test_count():
    """Counts start at zero and accumulate per key across inc() calls."""
    tally = PreshCounter()
    # A key that was never incremented reads back as zero.
    assert tally[12] == 0
    tally.inc(12, 1)
    assert tally[12] == 1
    # Interleave other keys before bumping key 12 a second time.
    for key, amount in ((14, 10), (9, 10), (12, 4)):
        tally.inc(key, amount)
    for key, expected in ((12, 5), (14, 10), (9, 10)):
        assert tally[key] == expected
github explosion / preshed / tests / test_hashing.py View on Github external
def test_resize():
    """A value stored early is still readable after many later inserts."""
    table = PreshMap(4)
    table[4] = 12
    # Insert 90 more keys into a map constructed with the argument 4.
    for key in range(10, 100):
        scale = random.random() + 1
        table[key] = int(key * scale)
    assert table[4] == 12
github explosion / preshed / tests / test_hashing.py View on Github external
def test_insert():
    """Missing keys read as None; inserted values are returned unchanged."""
    table = PreshMap()
    assert table[1] is None
    table[1] = 5
    assert table[1] == 5
    table[2] = 6
    # Adding a second key must not disturb the first.
    for key, expected in ((1, 5), (2, 6)):
        assert table[key] == expected
github explosion / preshed / tests / test_hashing.py View on Github external
def test_iter():
    """items() yields every inserted key/value pair exactly once."""
    table = PreshMap()
    keys = list(range(56, 24, -3))
    for key in keys:
        table[key] = key * 2
    # Start from the sums of what was inserted and subtract whatever the
    # iterator yields; both balances end at zero only if the totals match.
    key_balance = sum(keys)
    val_balance = sum(key * 2 for key in keys)
    for key, value in table.items():
        key_balance -= key
        val_balance -= value
    assert key_balance == 0
    assert val_balance == 0
github explosion / preshed / tests / test_hashing.py View on Github external
def test_zero_key():
    """Key 0 is stored and retrieved like any other key, before and after
    a batch of further inserts."""
    table = PreshMap()
    table[0] = 6
    table[5] = 12

    def check_fixed_entries():
        assert table[0] == 6
        assert table[5] == 12

    check_fixed_entries()
    for key in range(500, 1000):
        table[key] = key * random.random()
    # The two original entries must be unaffected by the 500 extra inserts.
    check_fixed_entries()
github explosion / spaCy / bin / init_model.py View on Github external
def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
    """Read a tab-separated frequencies file and accumulate smoothed counts.

    Each line is split into three tab-separated fields, ``freq``,
    ``doc_freq`` and ``key``; ``freq`` is added to a ``PreshCounter`` under
    the 1-based line number and to a running total.

    Args:
        loc: Path-like object providing ``exists()`` and ``open()``. Paths
            whose string form ends in ``gz`` are opened with ``gzip.open``.
        max_length, min_doc_freq, min_freq: Not used in the portion of the
            function shown here.

    Returns:
        ``({}, 0.0)`` when ``loc`` does not exist.

    NOTE(review): this excerpt stops right after the file is reopened, so
    the second pass and the normal return value are not visible here.
    """
    if not loc.exists():
        print("Warning: Frequencies file not found")
        return {}, 0.0
    counts = PreshCounter()
    total = 0
    is_gzipped = str(loc).endswith('gz')
    # First pass. The context manager closes this handle before the file is
    # reopened below; the original overwrote it without closing it.
    with (gzip.open(str(loc)) if is_gzipped else loc.open()) as file_:
        for i, line in enumerate(file_):
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            freq = int(freq)
            counts.inc(i+1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)
    # Reopen the same file from the start for the pass that follows.
    if is_gzipped:
        file_ = gzip.open(str(loc))
    else:
        file_ = loc.open()
github explosion / spacy-dev-resources / training / word_freqs.py View on Github external
def merge_counts(locs, out_loc):
    """Sum per-word frequencies from several files and write the totals.

    Every input line is ``freq<TAB>word``. Words are looked up in a
    ``StringStore`` and the result is used as the counter key, so the same
    word appearing in several files is accumulated under one key. The
    output file gets one ``count<TAB>word`` line per counter entry.

    Args:
        locs: Iterable of input file paths, read as UTF-8.
        out_loc: Path of the output file, written as UTF-8.
    """
    strings = StringStore()
    totals = PreshCounter()
    for path in locs:
        with io.open(path, 'r', encoding='utf8') as in_file:
            for raw_line in in_file:
                fields = raw_line.strip().split('\t', 1)
                freq, word = fields
                totals.inc(strings[word], int(freq))
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        for key, total in totals:
            # Look the counter key up in the same store to get the word back.
            word = strings[key]
            out_file.write('%d\t%s\n' % (total, word))
github explosion / spaCy / spacy / cli / init_model.py View on Github external
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
    counts = PreshCounter()
    total = 0
    with freqs_loc.open() as f:
        for i, line in enumerate(f):
            freq, doc_freq, key = line.rstrip().split("\t", 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    with freqs_loc.open() as f:
        for line in tqdm(f):
            freq, doc_freq, key = line.rstrip().split("\t", 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:

preshed

Cython hash table that trusts the keys are pre-hashed

MIT
Latest version published 1 year ago

Package Health Score

70 / 100
Full package analysis