Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): whitespace-mangled fragment of a larger method (it reads
# ``self``); original indentation/nesting was lost in extraction.
# create an index
# Lazily build per-language segment-frequency tables the first time this
# runs (guarded by the hasattr check below).
if not hasattr(self, 'freqs'):
self.chars = set()
self.freqs = {}
# one Counter of segment tokens per language (column) of the word list
for taxon in self.cols:
self.freqs[taxon] = Counter()
for word in self.get_list(
col=taxon, entry=self._numbers, flat=True):
self.freqs[taxon].update(word)
# accumulate the global alphabet across all languages
self.chars = self.chars.union(self.freqs[taxon].keys())
# "reduced" characters: drop everything before the first dot
# (presumably a "<langid>.<rest>" encoding — confirm upstream)
self.rchars = sorted(
set(char.split('.', 1)[1] for char in self.chars))
# append one ``charstring`` placeholder per column; ``charstring`` and
# ``self.width`` are defined elsewhere in the file
self.chars = sorted(self.chars) \
+ [charstring(i + 1) for i in range(self.width)]
if not self.chars:
raise ValueError("Your input data contains no entries!")
# chars whose third element is '0' are treated as unrecognized
# (assumption based on the error message below — TODO confirm)
self.bad_chars = [char for char in self.chars if char[2] == '0']
# abort when the share of unrecognized characters exceeds the
# configured limit; NOTE(review): relies on Python 3 true division
if len(self.bad_chars) / len(self.chars) > \
rcParams['lexstat_bad_chars_limit']:
raise ValueError(
"{0:.0f}% of the unique characters in your word "
"list are not "
"recognized by {1}. You should set check=True!".format(
100 * len(self.bad_chars) / len(self.chars),
util.PROG))
# Lazily create the scorer metadata slot on first use.
if not hasattr(self, "scorer"):
self._meta['scorer'] = {}
# create a scoring dictionary
# NOTE(review): the lines below are the tail of a positional argument
# list whose call head (an alignment routine that yields ``corrs``)
# was lost in extraction — this fragment is not runnable as-is.
threshold,
new_nums,
new_weights,
new_pros,
gop,
scale,
kw['factor'],
self.bscorer,
mode,
kw['restricted_chars'])
# change representation of gaps
# Replace the bare gap symbol '-' with a language-specific placeholder
# so gaps are attributed to the language they occur opposite to.
for (a, b), d in corrs.items():
# XXX check for bias XXX
if a == '-':
a = util.charstring(i + 1)
elif b == '-':
b = util.charstring(j + 1)
# average the correspondence count over all alignment modes
corrdist[tA, tB][a, b] += d / float(len(kw['modes']))
return corrdist
# NOTE(review): another truncated argument-list tail — the call head and
# the loop producing ``pairs``/``corrs`` were lost in extraction.
# Per-pair gap weights and prosodic strings looked up from the word list:
[self[pair, self._weights] for pair in pairs],
[self[pair, self._prostrings] for pair in pairs],
gop,
scale,
kw['factor'],
self.bscorer,
mode,
kw['restricted_chars'])
# change representation of gaps
# As in the sibling fragment: map '-' onto language-specific
# placeholders, then average counts over all alignment modes.
for (a, b), d in corrs.items():
# XXX check for bias XXX
if a == '-':
a = charstring(i + 1)
elif b == '-':
b = charstring(j + 1)
corrdist[tA, tB][a, b] += d / float(len(kw['modes']))
return corrdist
def lexstat_align(x, y):
    """Return the LexStat similarity score for two word-list entries.

    ``x`` and ``y`` are word-list row identifiers.  Sound-class strings,
    prosodic strings and language ids are looked up on the enclosing
    ``self``; the language-specific scorer ``self.cscorer`` supplies both
    the per-position gap weights and the substitution scores.  Only the
    score (third element of the ``calign.align_pair`` result) is
    returned, not the alignment itself.
    """
    seq_x = self[x, self._numbers]
    seq_y = self[y, self._numbers]
    # Per-position weights: each segment is scored against the *other*
    # entry's language-id character.
    lang_x = charstring(self[x, 'langid'])
    lang_y = charstring(self[y, 'langid'])
    weights_x = [self.cscorer[lang_y, seg] for seg in seq_x]
    weights_y = [self.cscorer[lang_x, seg] for seg in seq_y]
    result = calign.align_pair(
        seq_x,
        seq_y,
        weights_x,
        weights_y,
        self[x, self._prostrings],
        self[y, self._prostrings],
        1,
        kw['scale'],
        kw['factor'],
        self.cscorer,
        kw['mode'],
        kw['restricted_chars'],
        1,
    )
    return result[2]
# NOTE(review): truncated argument-list tail — the call head producing
# ``corrs`` (and the bindings of ``new_weights``/``new_pros``) were lost
# in extraction.
new_weights,
new_pros,
gop,
scale,
kw['factor'],
self.bscorer,
mode,
kw['restricted_chars'])
# change representation of gaps
# Map the bare gap symbol '-' to a language-specific placeholder, then
# average the correspondence counts over all alignment modes.
for (a, b), d in corrs.items():
# XXX check for bias XXX
if a == '-':
a = util.charstring(i + 1)
elif b == '-':
b = util.charstring(j + 1)
corrdist[tA, tB][a, b] += d / float(len(kw['modes']))
return corrdist
# NOTE(review): fragment of the scorer-construction routine; it is cut
# off mid-branch after the ``unattested`` case, so the write-back of
# ``score`` into ``matrix`` is not visible here.
# get the correspondence distribution
self._corrdist = self._get_corrdist(**kw)
# get the random distribution
self._randist = self._get_randist(**kw)
# get the average gop
gop = sum([m[1] for m in kw['modes']]) / len(kw['modes'])
# create the new scoring matrix
# deep-ish copy: one fresh row list per line of the base scorer matrix
matrix = [[c for c in line] for line in self.bscorer.matrix]
char_dict = self.bscorer.chars2int
# score every character pair for every ordered pair of languages,
# including the per-column ``charstring`` placeholders
for (i, tA), (j, tB) in util.multicombinations2(enumerate(self.cols)):
for charA, charB in product(
list(self.freqs[tA]) + [charstring(i + 1)],
list(self.freqs[tB]) + [charstring(j + 1)]
):
# expected (random) vs. attested counts; ``False`` doubles as
# "no observation" for the branching below
exp = self._randist.get(
(tA, tB), {}).get((charA, charB), False)
att = self._corrdist.get(
(tA, tB), {}).get((charA, charB), False)
# in the following we follow the former lexstat protocol
# smoothing: discard rare cross-language attestations
if att <= kw['smooth'] and i != j:
att = False
# log-odds-style score of attested vs. expected frequencies
if att and exp:
score = np.log2((att ** 2) / (exp ** 2))
elif att and not exp:
score = np.log2((att ** 2) / kw['unexpected'])
elif exp and not att:
score = kw['unattested']  # XXX gop ???
# NOTE(review): truncated argument-list tail — the call head producing
# ``corrs`` and the loop defining ``tA``/``tB``/``included`` were lost
# in extraction.
gop,
scale,
kw['factor'],
self.bscorer,
mode,
kw['restricted_chars'])
# change representation of gaps
# Iterate over a snapshot of the keys because ``a``/``b`` are remapped
# below before being used as new keys in ``corrdist``.
for a, b in list(corrs.keys()):
# get the correspondence count
# scale by this language pair's share of included word pairs
d = corrs[a, b] * self._included[tA, tB] / included
# XXX check XXX* len(self.pairs[tA,tB]) / runs
# check for gaps
# replace the bare gap symbol with a language-specific placeholder
if a == '-':
a = charstring(i + 1)
elif b == '-':
b = charstring(j + 1)
# average over all alignment modes
corrdist[tA, tB][a, b] += d / len(kw['modes'])
return corrdist