Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
import sys
import ngram
if __name__ == "__main__":
    # Usage: <script> <input corpus> <output file>
    dataFile = sys.argv[1]
    outFile = sys.argv[2]

    # Sampler settings: one minimum count per entry in freqFilter, plus the
    # threshold parameter handed through as ngramThresholdBeta.
    beta = 0.125
    freqFilter = [20, 120, 90]

    lines = ngram.SentenceNgramSampler(
        dataFile,
        minCounts=freqFilter,
        ngramThresholdBeta=beta,
    )

    # Each item yielded by the sampler is written as one space-separated line.
    with open(outFile, "w") as out:
        out.writelines(" ".join(tokens) + "\n" for tokens in lines)

    print("Job finished")
# Settings for the factored Estonian data.
tokFactor = 1
posFactor = 2
firstPosFilter = "A,S,H,V,X,D,G,U,Y"
lastPosFilter = "S,H,V,X,K,Y"
freqFilter = [5, 50]
somePosFilter = None
crazyBigMFCorpus = True
beta = 0.125
epochs = 10

logging.basicConfig(level=logging.INFO)

lines = ngram.SentenceNgramSampler(
    dataFile,
    minCounts=freqFilter,
    tokFactor=tokFactor,
    posFactor=posFactor,
    firstPosFilter=firstPosFilter,
    lastPosFilter=lastPosFilter,
    atLeastOnePosFilter=somePosFilter,
    ngramThresholdBeta=beta,
    crazyBigMFCorpus=crazyBigMFCorpus,
)

# With more than one frequency threshold, run one throw-away pass over the
# sampler before training.
# NOTE(review): presumably this lets the sampler gather its n-gram statistics
# -- confirm against SentenceNgramSampler.
if len(freqFilter) > 1:
    debug("Initializing")
    for _ in lines:
        pass

if epochs > 0:
    # NOTE(review): if Word2Vec is gensim's, the `iter` keyword was renamed to
    # `epochs` in gensim 4 -- confirm the installed version.
    model = Word2Vec(workers=60, sg=1, hs=1, iter=10, min_count=freqFilter[0])

    debug("Building vocab")
    model.build_vocab(lines)

    debug("Learning")
    # Train one epoch at a time so a checkpoint can be saved after each one.
    # len(lines) requires the sampler to implement __len__.
    for epoch in range(epochs):
        model.train(lines, total_examples=len(lines), epochs=1)
        model.save("{}.trainable.{}".format(modelFile, epoch))
def _applyJoinOps(self, sentence, toJoin):
    """Merge groups of tokens in a sentence into single "__"-joined tokens.

    Args:
        sentence: indexable sequence of records whose first element is the
            token text (e.g. ``(token, tag)`` pairs). Records may carry any
            number of further fields; only index 0 is read.
        toJoin: iterable of non-empty collections of integer positions into
            ``sentence``. Each collection is one join operation.

    Returns:
        A new list of token strings. For every operation, the span from its
        smallest to its largest position is replaced by one token: the texts
        at the listed positions, in ascending order, joined with "__".
        Positions inside the span that the operation does not list are
        dropped from the output.

    Raises:
        ValueError: if an operation is empty (``min`` of an empty collection).
    """
    # Read index 0 instead of unpacking pairs, consistent with the join below,
    # so records with more than two fields are accepted.
    result = [record[0] for record in sentence]

    # Apply the rightmost operation first: replacing a span shortens the list,
    # which would shift the positions of everything to its right.
    # NOTE(review): overlapping operations are not detected -- confirm callers
    # never produce them.
    for op in sorted(toJoin, key=min, reverse=True):
        positions = sorted(op)
        first, last = positions[0], positions[-1]
        merged = "__".join(sentence[i][0] for i in positions)
        result = result[:first] + [merged] + result[last + 1:]

    return result
def __iter__(self):
    """Return this object itself as the iterator.

    NOTE(review): iterating the same object more than once therefore depends
    on its __next__ resetting state when a pass ends -- confirm.
    """
    return self
if __name__ == "__main__":
    # Demo: print everything the sampler yields for the corpus given as the
    # first command-line argument, then iterate the same object a second time.
    logging.basicConfig(level=logging.INFO)

    sampler = SentenceNgramSampler(sys.argv[1], minCounts=[2, 2, 2])

    for passNumber in (1, 2):
        if passNumber == 2:
            print("Second iteration")
        for snt in sampler:
            print(snt)