with util.pb(
        desc='THRESHOLD DETERMINATION',
        total=len(self.pairs) - len(self.cols)) as progress:
for l1, l2 in self.pairs:
progress.update(1)
if l1 != l2:
pairs = self.pairs[l1, l2]
for p1, p2 in pairs:
dx = [align(p1, random.choice(pairs)[1])
      for _ in range(len(pairs) // 20 or 5)]
thresholds.extend(dx)
if thresholds:
threshold = sum(thresholds) / len(thresholds) * 0.5
self._meta['guessed_threshold'] = threshold
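# The logic above in isolation: sample random (and thus mostly
# unrelated) word pairs, average their alignment distances, and take
# half of that mean as the cognate threshold. A minimal standalone
# sketch, assuming a generic pairwise distance function `dist` in
# place of align():
import random

def guess_threshold(items, dist, samples=100):
    """Estimate a clustering threshold from randomly drawn item pairs."""
    scores = [dist(random.choice(items), random.choice(items))
              for _ in range(samples)]
    return 0.5 * sum(scores) / len(scores)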
with util.pb(
desc='SEQUENCE CLUSTERING',
total=len(self.rows)) as progress:
for concept, indices, matrix in matrices:
progress.update(1)
# check for keyword to guess the threshold
if kw['guess_threshold'] and kw['gt_mode'] == 'item':
t = clustering.best_threshold(matrix, kw['gt_trange'])
# FIXME: considering new function here JML
# elif kw['guess_threshold'] and kw['gt_mode'] == 'nullditem':
# pass
else:
t = threshold
c = fclust(matrix, t)
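# For reference, flat clustering of a small distance matrix with
# lingpy's flat_upgma, assuming fclust above wraps a flat cluster
# routine of this kind (a sketch, not necessarily the exact call):
from lingpy.algorithm import clustering

matrix = [
    [0.0, 0.2, 0.9],
    [0.2, 0.0, 0.8],
    [0.9, 0.8, 0.0]]
clusters = clustering.flat_upgma(0.5, matrix, taxa=['a', 'b', 'c'])
# 'a' and 'b' join one cluster, since their distance falls below 0.5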
while len(words) < kw['rands']:
    words.append(random.choice(words))
seqs[taxon], pros[taxon], weights[taxon] = [], [], []
for w in words:
cls = tokens2class(w.split(' '), self.model,
cldf=self._cldf)
pros[taxon].append(prosodic_string(w.split(' ')))
weights[taxon].append(prosodic_weights(pros[taxon][-1]))
seqs[taxon].append([
'{0}.{1}'.format(c, p) for c, p in zip(
cls,
[self._transform[pr] for pr in pros[taxon][-1]]
)])
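# A standalone illustration of the encoding built above: each token is
# mapped to a sound class and combined with its prosodic environment,
# yielding context-sensitive segment representations (here with the
# 'sca' model in place of self.model):
from lingpy.sequence.sound_classes import (
    prosodic_string, prosodic_weights, tokens2class)

tokens = ['t', 'o', 'x', 't', 'e', 'r']
classes = tokens2class(tokens, 'sca')
prostring = prosodic_string(tokens)
weights = prosodic_weights(prostring)
encoded = ['{0}.{1}'.format(c, p) for c, p in zip(classes, prostring)]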
with util.pb(
desc='RANDOM CORRESPONDENCE CALCULATION',
total=tasks) as progress:
for (i, tA), (j, tB) in util.multicombinations2(
enumerate(self.cols)):
progress.update(1)
log.info(
"Calculating random alignments"
" for pair {0}/{1}.".format(tA, tB)
)
corrdist[tA, tB] = defaultdict(float)
for mode, gop, scale in kw['modes']:
corrs, included = calign.corrdist(
    10.0,
    [(seqs[tA][x], seqs[tB][y]) for x, y in sample],
    [(weights[tA][x], weights[tB][y]) for x, y in sample],
    [(pros[tA][x], pros[tB][y]) for x, y in sample],
    gop, scale, kw['factor'], self.bscorer, mode,
    kw['restricted_chars'])

# from the profile-preparation routine: parse the raw word into
# segments
cleaned_string = clean_string(word,
    merge_vowels=merge_vowels, brackets=None, ignore_brackets=False,
    split_entries=False, preparse=None, rules=None,
    merge_geminates=merge_geminates)[0]
# retain the whole word if there are splitters in the word
if any(x in brackets + splitters for x in cleaned_string):
profile[word] += 1
bad_words.add(word)
else:
for segment in cleaned_string.split(' '):
profile[segment] += 1
for segment in [x for x in word if x not in cleaned_string]:
profile[segment] += 1
nulls.add(segment)
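# The profiling logic above in miniature: segments of cleanly parsed
# words are counted individually, while words containing splitters are
# retained whole. A sketch with hypothetical `forms` and `splitters`:
from collections import Counter

def count_segments(forms, splitters='/,;~'):
    profile = Counter()
    for word in forms:
        if any(c in splitters for c in word):
            profile[word] += 1  # keep unsplittable words intact
        else:
            for segment in word.split(' '):
                profile[segment] += 1
    return profile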
for s, f in pb(
        sorted(profile.items(), key=lambda x: x[1], reverse=True),
        desc='preparing profile'):
sclass = token2class(s, 'dolgo')
if s in bad_words:
ipa = bad_word.format(s)
elif sclass == '0' and s not in nulls:
ipa = bad_sound.format(s)
elif s in nulls:
ipa = 'NULL'
elif clts:
sound = clts.get(s, False)
if not sound:
ipa = '!'+s
else:
ipa = text_type(sound)
else:
ipa = s
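# token2class maps a single segment to a sound-class symbol; the Dolgo
# model returns '0' for anything it cannot classify, which is exactly
# what the bad_sound branch above tests for:
from lingpy.sequence.sound_classes import token2class

token2class('t', 'dolgo')  # 'T' (dental obstruent)
# unrecognized input comes back as '0' and is flagged as a bad sound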
kw = dict(
    modes=rcParams['lexstat_modes'],
factor=rcParams['align_factor'],
restricted_chars=rcParams['restricted_chars'],
runs=rcParams['lexstat_runs'],
rands=rcParams['lexstat_rands'],
limit=rcParams['lexstat_limit'],
method=rcParams['lexstat_scoring_method'])
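# These rcParams defaults can be overridden per call; e.g. a quicker,
# less precise scorer run (a sketch, assuming a LexStat instance built
# from a hypothetical wordlist file):
from lingpy import LexStat

lex = LexStat('wordlist.tsv')  # hypothetical input
lex.get_scorer(runs=100, method='shuffle')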
kw.update(keywords)
# determine the mode
method = 'markov' if kw['method'] in ['markov', 'markov-chain', 'mc'] \
else 'shuffle'
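# The two null models in brief: 'shuffle' draws random cross-language
# word pairs from the attested data, while 'markov' generates
# pseudo-words from n-gram statistics. The shuffle idea as a sketch:
import random

def shuffled_pairs(words_a, words_b, runs=1000):
    """Yield random word pairs as a null model for chance similarity."""
    for _ in range(runs):
        yield random.choice(words_a), random.choice(words_b)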
corrdist = {}
tasks = (self.width ** 2) / 2
with util.pb(
desc='RANDOM CORRESPONDENCE CALCULATION',
total=tasks) as progress:
for (i, tA), (j, tB) in util.multicombinations2(
enumerate(self.cols)):
progress.update(1)
log.info(
"Calculating random alignments"
"for pair {0}/{1}.".format(tA, tB)
)
corrdist[tA, tB] = defaultdict(float)
# create morpheme-segmented pairs
pairs = self.pairs[tA, tB]
new_nums, new_weights, new_pros = [], [], []
for idxA, idxB in pairs:
for iA, iB in self._slices[idxA]:
    ...  # excerpt breaks off here

# from the context-profile routine: multi-segment words get explicit
# word-boundary contexts
else:
    context_pre = ['^'] + (len(cleaned_string) - 1) * ['']
    context_post = (len(cleaned_string) - 1) * [''] + ['$']
for ctxA, ctxB, segment in zip(context_pre, context_post, cleaned_string):
profile[ctxA+segment+ctxB] += [(language, word)]
for segment in [x for x in word if x not in ' '.join(cleaned_string)]:
profile[segment] += [(language, word)]
nulls.add(segment)
except Exception:  # record the failure instead of silently passing
    errors.add(idx)
    log.warn('problem parsing {0}'.format(word))
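# The context marking above in isolation: the first segment of a word
# is prefixed with '^', the last suffixed with '$', so that e.g.
# word-initial and word-final 's' can receive different conversions:
def contextualize(segments):
    pre = ['^'] + ['' for _ in segments[1:]]
    post = ['' for _ in segments[:-1]] + ['$']
    return [a + s + b for a, s, b in zip(pre, post, segments)]

contextualize(['s', 'a', 's'])  # ['^s', 'a', 's$']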
for s in '^$':
yield s, 'NULL', '', '', '', ''
for idx, (s, entries) in pb(
        enumerate(sorted(
            profile.items(), key=lambda x: len(x[1]), reverse=True)),
        desc='yielding entries', total=len(profile)):
sclass = token2class(s.strip('^$'), 'dolgo')
words = [entry[1] for entry in entries][:max_entries]
langs = [entry[0] for entry in entries][:max_entries]
languages = ', '.join(sorted(
    set(langs), key=lambda x: langs.count(x), reverse=True))
frequency = str(len(langs))
codepoints = codepoint(s)
examples_ = ', '.join(sorted(
    set(words), key=lambda x: words.count(x), reverse=True)[:examples])
if s in bad_words:
ipa = bad_word.format(s)
elif sclass == '0':
ipa = bad_sound.format(s)
elif s in nulls:
ipa = 'NULL'
rcParams['ref'] = keywords['ref']
# reassign ref for convenience
ref = keywords['ref']
# check for existing alignments
test = list(self.msa[ref].keys())[0]
if 'alignment' not in self.msa[ref][test]:
log.error(
"No alignments could be found. You should carry out"
" an alignment analysis first!")
return
# go on with the analysis
cons_dict = {}
with util.pb(desc='CONSENSUS', total=len(self.etd[ref])) as progress:
for cog in self.etd[ref]:
progress.update(1)
if cog in self.msa[ref]:
log.debug("Analyzing cognate set number '{0}'...".format(cog))
# temporary solution for sound-class integration
if classes:
_classes = []
if weights:
keywords['weights'] = prosodic_weights(
prosodic_string(self.msa[ref][cog]['_sonority_consensus'])
)
else:
keywords['weights'] = [
1.0 for i in range(len(self.msa[ref][cog]['alignment']))]
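# Consensus in its simplest form: per alignment column, take the most
# frequent non-gap symbol. The actual routine additionally weights
# columns by prosodic position, as set up above. A plain sketch:
from collections import Counter

def majority_consensus(alignment):
    """Column-wise majority vote over aligned sequences."""
    return [
        Counter(c for c in col if c != '-').most_common(1)[0][0]
        for col in zip(*alignment)]

majority_consensus([['t', 'a', '-'], ['t', 'a', 'n'], ['d', 'a', 'n']])
# ['t', 'a', 'n']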
# check for parameters and add clustering, in order to make sure that
# analyses are not repeated
if not hasattr(self, 'params'):
self.params = {}
self.params['partial_cluster'] = "{0}_{1}_{2:.2f}".format(
method, cluster_method, threshold)
self._stamp += '# Partial Cluster: ' + self.params['partial_cluster']
matrices = self._get_partial_matrices(
    method=method, scale=scale, factor=factor,
    restricted_chars=restricted_chars, mode=mode, gop=gop,
    imap_mode=kw['imap_mode'], split_on_tones=split_on_tones)
k = 0
C = defaultdict(list) # stores the pcogids
G = {} # stores the graphs
with util.pb(
        desc='PARTIAL SEQUENCE CLUSTERING',
        total=len(self.rows)) as progress:
for concept, trace, matrix in matrices:
progress.update(1)
lingpy.log.info('Analyzing concept {0}...'.format(concept))
if external_function:
c = external_function(threshold, matrix,
taxa=list(range(len(matrix))), revert=True)
elif cluster_method == 'infomap':
c = extra.infomap_clustering(threshold,
matrix, taxa=list(range(len(matrix))),
revert=True)
elif cluster_method == 'mcl':
c = clustering.mcl(threshold, matrix,
taxa=list(range(len(matrix))),
max_steps=kw['max_steps'],
inflation=kw['inflation'],
expansion=kw['expansion'],
subset=False)
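# Any external_function passed in above follows the same contract as
# the built-in routines: take a threshold and a distance matrix and
# return a {taxon: cluster_id} mapping when revert=True. A greedy
# stand-in (a sketch, not the library's implementation):
def naive_clusters(threshold, matrix, taxa=None, revert=True):
    taxa = taxa if taxa is not None else list(range(len(matrix)))
    out, next_id = {}, 0
    for i, taxon in enumerate(taxa):
        for j in range(i):
            if matrix[i][j] <= threshold:
                out[taxon] = out[taxa[j]]  # join an earlier cluster
                break
        else:
            next_id += 1
            out[taxon] = next_id  # open a new cluster
    return out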
kw.update(keywords)
self._included = {}
corrdist = {}
if kw['preprocessing']:
if kw['ref'] not in self.header:
self.cluster(
method=kw['preprocessing_method'],
threshold=kw['preprocessing_threshold'],
gop=kw['gop'],
cluster_method=kw['cluster_method'],
ref=kw['ref'])
with util.pb(
desc='CORRESPONDENCE CALCULATION',
total=self.width ** 2 / 2) as pb:
for (i, tA), (j, tB) in util.multicombinations2(
enumerate(self.cols)):
pb.update(1)
log.info("Calculating alignments for pair {0} / {1}.".format(
tA, tB))
corrdist[tA, tB] = defaultdict(float)
for mode, gop, scale in kw['modes']:
pairs = self.pairs[tA, tB]
if kw['subset']:
pairs = [
pair for pair in pairs if pair in
self.subsets[tA, tB]]
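# For reference, the corrdist bookkeeping pattern used throughout: a
# defaultdict(float) lets the alignment loops accumulate (possibly
# fractional) correspondence counts without key checks:
from collections import defaultdict

dist = defaultdict(float)
for a, b in [('t.C', 'd.C'), ('t.C', 'd.C'), ('a.V', 'e.V')]:
    dist[a, b] += 1.0
# dist[('t.C', 'd.C')] == 2.0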