tracer[char][2] += 1
bor += 1
states[name] = [ret, inn, bor]
# calculate the scores
ret = sum([c[0] for c in tracer.values()])
inn = sum([c[1] for c in tracer.values()])
tra = sum([c[2] for c in tracer.values()])
ipn = inn / len(acs)
tpn = tra / len(acs)
total2 = ipn + tpn
log.info("Innovations: {0}, {1:.2f}, {2:.2f}".format(inn, ipn, ipn / total2))
log.info("Transferred: {0}, {1:.2f}, {2:.2f}".format(tra, tpn, tpn / total2))
if return_dists:
leaves = []
nodes = []
for node in [n for n in tree.getNodeNames() if n != 'root']:
innovations = states[node][1] + states[node][2]
if node in tree.taxa:
leaves += [innovations]
else:
nodes += [innovations]
# evaluate with a Kruskal-Wallis test (H statistic and p-value)
h, p = sps.mstats.kruskalwallis(leaves, nodes)
return h, p
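# A minimal, self-contained sketch of the statistical step above: the innovation
# counts of leaves and of internal nodes are compared with a Kruskal-Wallis test.
# The two sample lists are invented illustration data, not real results.
from scipy.stats import mstats

leaves_demo = [3, 5, 2, 4, 6]
nodes_demo = [1, 2, 1, 3]
h_demo, p_demo = mstats.kruskalwallis(leaves_demo, nodes_demo)
print("H = {0:.2f}, p = {1:.3f}".format(h_demo, p_demo))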
tasks = (self.width ** 2) / 2
if method == 'markov':
seqs, pros, weights = {}, {}, {}
# get a random distribution for all pairs
sample = random.sample(
[(i, j) for i in range(kw['rands']) for j in
range(kw['rands'])], kw['runs'])
with util.pb(
desc='SEQUENCE GENERATION',
total=len(self.cols)) as progress:
for i, taxon in enumerate(self.cols):
progress.update(1)
log.info("Analyzing taxon {0}.".format(taxon))
tokens = self.get_list(col=taxon, entry="tokens", flat=True)
prostrings = self.get_list(
col=taxon, entry=self._prostrings, flat=True)
m = MCPhon(tokens, True, prostrings)
words = []
j, k = 0, 0
# draw strings from the Markov chain until kw['rands'] unique words have
# been collected, giving up after kw['limit'] duplicate draws
while j < kw['rands']:
s = m.get_string(new=False)
if s in words:
k += 1
if k > kw['limit']:
break
else:
j += 1
words += [s]
if len(words) < kw['rands']:
self.sep,
strip_lines=False  # this is of crucial importance, otherwise the raw lines would be stripped and the cell layout distorted
)
# columns that have language data
language_indices = []
concept_id = 0
# first row must be the header in the input; TODO: add more functionality
header = spreadsheet[0]
log.info('%s' % header[0:10])
for i, cell in enumerate(header):
cell = cell.strip()
log.info('%s' % cell)
if cell == self.meanings:
concept_id = i
if self.language_id in cell:
language_indices.append(i)
matrix_header = []
matrix_header.append(header[concept_id])
for i in language_indices:
matrix_header.append(header[i].replace(self.language_id, "").strip())
self.matrix.append(matrix_header)
# append one row per concept with the words of each language (skip the header row)
for i in range(1, len(spreadsheet)):
matrix_row = []
# skip rows whose concept cell is empty if skip_empty_concepts is set
if spreadsheet[i][concept_id] == "" and self.skip_empty_concepts:
This is an iterator object; for each concept it yields the word indices
belonging to that concept, the corresponding distance matrix, and the
concept itself.
"""
# currently, there are no defaults XXX
kw = dict(
defaults=False,
external_scorer=False, # external scoring function
)
kw.update(keywords)
function = self._distance_method(
method, scale=scale, factor=factor,
restricted_chars=restricted_chars, mode=mode, gop=gop,
restriction=restriction, external_scorer=kw['external_scorer'])
concepts = [concept] if concept else sorted(self.rows)
for c in concepts:
log.info("Analyzing words for concept <{0}>.".format(c))
indices = self.get_list(row=c, flat=True)
matrix = []
for idxA, idxB in util.combinations2(indices):
try:
d = function(idxA, idxB)
except ZeroDivisionError:
log.warning(
"Encountered Zero-Division for the comparison of "
"{0} ({2}) and {1} ({3})".format(
''.join(self[idxA, self._segments]),
''.join(self[idxB, self._segments]),
idxA, idxB
))
d = 100
matrix += [d]
matrix = misc.squareform(matrix)
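# Sketch of what the two steps above amount to, using only the standard library:
# the flat list of pairwise distances produced by combinations2 is folded back
# into a symmetric square matrix with zeros on the diagonal (toy data assumed).
from itertools import combinations

items = ['a', 'b', 'c']
flat = [0.2, 0.5, 0.1]  # distances for (a, b), (a, c), (b, c)
square = [[0.0] * len(items) for _ in items]
for (i, j), d in zip(combinations(range(len(items)), 2), flat):
    square[i][j] = d
    square[j][i] = d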
The information in such a file is automatically converted into a
scoring dictionary (see :evobib:`List2012b` for details).
Based on the information provided by the files, a dictionary for the
conversion of IPA-characters to sound classes and a scoring dictionary are
created and stored as a binary. The model can be loaded with help of the
:py:class:`~lingpy.data.model.Model` class and used in the various classes
and functions provided by the library.
See also
--------
lingpy.data.model.Model
compile_dvt
"""
log.info("Compiling model <" + model + ">...")
# get the path to the models
new_path = lambda *cmps: os.path.join(path or util.data_path('models'), model, *cmps)
log.debug("Model-Path: %s" % new_path)
# load the sound classes
sound_classes = _import_sound_classes(new_path('converter'))
# dump the data
cache.dump(sound_classes, model + '.converter')
log.info("... successfully created the converter.")
# try to load the scoring function or the score tree
scorer = False
if os.path.isfile(new_path('matrix')):
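# Hedged usage sketch for the docstring above: once a model has been compiled,
# it can be loaded through the Model class listed under "See also". The model
# name "sca" is one of the sound-class models shipped with LingPy; the attribute
# names below follow the lingpy.data.model.Model documentation.
from lingpy.data.model import Model

sca = Model('sca')
print(sca.converter['t'])  # sound class assigned to the IPA segment "t"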
elif data in ['groups', 'cluster']:
if 'distances' not in wordlist._meta:
distances = wl2dst(wordlist, taxa, concepts, ref, **keywords)
else:
distances = wordlist._meta['distances']
if 'groups' in wordlist._meta and not keywords['force']:
logger.warning(
"Groups have already been calculated, "
"force overwrite by "
"setting 'force' to 'True'.")
return
wordlist._meta['groups'] = clustering.matrix2groups(
keywords['threshold'], distances, these_taxa,
keywords['cluster_method'])
log.info("Successfully calculated {0}.".format(data))
:py:obj:`rcParams['vowels']`, :py:obj:`rcParams['diacritics']`, and
:py:obj:`rcParams['tones']`. Their core purpose is to guide the
tokenization of IPA strings (cf.
:py:func:`~lingpy.sequence.sound_classes.ipa2tokens`). In order to change the
variables, one simply has to change the text files :file:`diacritics`,
:file:`tones`, and
:file:`vowels` in the :file:`data/models/dv` directory. The structure of
these files is fairly simple: Each line contains a vowel or a diacritic
character, with diacritics preceded by a dash.
See also
--------
lingpy.data.model.Model
lingpy.data.derive.compile_model
"""
log.info("Compiling diacritics and vowels...")
# get the path to the models
if not path:
file_path = util.data_path('models', 'dvt')
elif path in ['evolaemp', 'el']:
file_path = util.data_path('models', 'dvt_el')
else:
file_path = path
def _read_string(name):
# normalize stuff
# TODO: this is potentially dangerous and it is important to decide whether
# TODO: switching to NFD might not be a better choice
return util.read_text_file(
os.path.join(file_path, name), normalize='NFC').replace('\n', '')
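# Brief illustration of what the diacritics, vowels, and tones data is used for:
# the tokenizer named in the docstring above segments an IPA string into tokens,
# keeping diacritics attached to their base characters (example string invented).
from lingpy.sequence.sound_classes import ipa2tokens

print(ipa2tokens('tʰɔxtər'))  # e.g. ['tʰ', 'ɔ', 'x', 't', 'ə', 'r']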
"""
Converts a file exported directly from STARLING into LingPy's QLC format.
"""
cleant = clean_taxnames or identity
data = csv2list(filename)
# strip a possible byte order mark (BOM) that some editors prepend to the file
data[0][0] = data[0][0].replace('\ufeff', '')
# get the header
header = data[0]
# debugging
if debug:
error = False
log.info("Header line has length {0}.".format(len(header)))
for line in data[1:]:
if len(line) != len(header): # pragma: no cover
log.error("Error for item {0} with length {1}, expected {2}.".format(
'/'.join(line[0:2]), len(line), len(header)))
error = True
if error: # pragma: no cover
log.error("Errors were found, aborting function call.")
return
else:
log.info("Everything went fine, carrying on with function call.")
# determine language names in header
taxa = []
for i in range(len(header) - 1):
prev = header[i]
post = header[i + 1]
Create confidence scores for a given set of alignments.
Parameters
----------
scorer : :py:class:`~lingpy.algorithm._misc.ScoreDict`
A *ScoreDict* object which gives similarity scores for all segments in
the alignment.
ref : str (default="lexstatid")
The reference entry-type, referring to the cognate-set to be used for
the analysis.
gap_weight : float (default=1.0)
The weight assigned to matches containing gaps.
"""
corrs = confidence.get_confidence(self, scorer, ref, gap_weight)
log.info("Successfully calculated confidence values for alignments.")
return corrs
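# Hedged usage sketch for the method documented above, assuming a preceding
# LexStat analysis has produced a scorer and cognate sets under "lexstatid";
# the file name and the threshold are placeholders, not values from this module.
from lingpy import Alignments, LexStat

lex = LexStat('wordlist.tsv')
lex.get_scorer(runs=100)
lex.cluster(method='lexstat', threshold=0.6, ref='lexstatid')
alm = Alignments(lex, ref='lexstatid')
alm.align()
corrs = alm.get_confidence(lex.rscorer, ref='lexstatid', gap_weight=1.0)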