# (Scraped web boilerplate, commented out so the module stays importable.)
# Secure your code as it's written. Use Snyk Code to scan source code in
# minutes - no build needed - and fix issues immediately.
def lemma_from_key(self, key):
    """Return the Lemma object identified by a WordNet sense key.

    :param key: a sense key such as ``'dog%1:05:00::'`` in the form
        ``lemma%ss_type:lex_filenum:lex_id:head_word:head_id``.
    :raises WordNetError: if no synset or lemma matches the key.
    """
    # Keys are stored lower-case in index.sense, so normalize first.
    key = key.lower()
    lemma_name, lex_sense = key.split('%')
    pos_number, lexname_index, lex_id, _, _ = lex_sense.split(':')
    # The ss_type digit selects the part of speech.
    pos = self._pos_names[int(pos_number)]

    # Lazily open the sense-key -> synset-offset index file.
    if self._key_synset_file is None:
        self._key_synset_file = self.open('index.sense')

    # Find the synset for the lemma.
    synset_line = _binary_search_file(self._key_synset_file, key)
    if not synset_line:
        raise WordNetError("No synset found for key %r" % key)
    offset = int(synset_line.split()[1])
    synset = self._synset_from_pos_and_offset(pos, offset)

    # Return the lemma whose generated sense key matches the request.
    for lemma in synset.lemmas:
        if lemma.key == key:
            return lemma
    # Fixed duplicated word in the original message ("for for").
    raise WordNetError("No lemma found for key %r" % key)
def synset(self, name):
    """Return the Synset named e.g. ``'dog.n.01'``.

    :param name: ``'<lemma>.<pos>.<sense number>'``; the sense number
        is 1-based, matching the printed WordNet sense ordering.
    :raises WordNetError: if the lemma/POS pair is unknown, the sense
        number is out of range, or a satellite adjective ('s') was
        requested but only a plain adjective ('a') exists.
    """
    # Split name into lemma, part of speech and synset number.
    # rsplit copes with lemmas that themselves contain dots.
    lemma, pos, synset_index_str = name.lower().rsplit('.', 2)
    synset_index = int(synset_index_str) - 1

    # Get the data-file offset for this synset.
    # (The original text duplicated this whole lookup block inside the
    # sanity check below; the duplicate has been removed.)
    try:
        offset = self._lemma_pos_offset_map[lemma][pos][synset_index]
    except KeyError:
        message = 'no lemma %r with part of speech %r'
        raise WordNetError(message % (lemma, pos))
    except IndexError:
        n_senses = len(self._lemma_pos_offset_map[lemma][pos])
        message = "lemma %r with part of speech %r has only %i %s"
        if n_senses == 1:
            tup = lemma, pos, n_senses, "sense"
        else:
            tup = lemma, pos, n_senses, "senses"
        raise WordNetError(message % tup)

    # Load synset information from the appropriate file.
    synset = self._synset_from_pos_and_offset(pos, offset)

    # Some basic sanity checks on loaded attributes.
    if pos == 's' and synset.pos == 'a':
        message = ('adjective satellite requested but only plain '
                   'adjective found for lemma %r')
        raise WordNetError(message % lemma)
    assert synset.pos == pos or (pos == 'a' and synset.pos == 's')

    # Return the synset object.
    return synset
# NOTE(review): fragment of a larger data-line parser -- the enclosing
# `def`, the `try` matching the `except` below, and the bindings for
# lemma_number, frame_number, frame_string_fmt and data_file_line all
# lie outside this chunk; do not restructure in isolation.
# lemma_number == 0 appears to mean the verb frame applies to every
# lemma in the synset -- TODO confirm against the WordNet data format.
if lemma_number == 0:
synset.frame_ids.append(frame_number)
for lemma in synset.lemmas:
lemma.frame_ids.append(frame_number)
lemma.frame_strings.append(frame_string_fmt %
lemma.name)
# only a specific word in the synset
else:
# lemma_number is 1-based in the data file, hence the -1.
lemma = synset.lemmas[lemma_number - 1]
lemma.frame_ids.append(frame_number)
lemma.frame_strings.append(frame_string_fmt %
lemma.name)
# raise a more informative error with line text
except ValueError as e:
raise WordNetError('line %r: %s' % (data_file_line, e))
# set sense keys for Lemma objects - note that this has to be
# done afterwards so that the relations are available
for lemma in synset.lemmas:
# Satellite adjectives embed their head adjective's name and
# lexical id in the sense key.
if synset.pos == ADJ_SAT:
head_lemma = synset.similar_tos()[0].lemmas[0]
head_name = head_lemma.name
head_id = '%02d' % head_lemma._lex_id
else:
head_name = head_id = ''
tup = (lemma.name, WordNetCorpusReader._pos_numbers[synset.pos],
lemma._lexname_index, lemma._lex_id, head_name, head_id)
# Sense keys are always stored lower-case (cf. index.sense).
lemma.key = ('%s%%%d:%02d:%02d:%s:%s' % tup).lower()
# the canonical name is based on the first lemma
lemma_name = synset.lemmas[0].name.lower()
def information_content(synset, ic):
    """Return the information content of *synset* under the counts in *ic*.

    ``ic`` maps a part of speech to a table of counts indexed by synset
    offset; slot 0 holds the total count for that part of speech.  A
    zero count yields ``_INF``; otherwise the IC is the negative log of
    the synset's relative frequency.

    :raises WordNetError: if *ic* has no table for the synset's POS.
    """
    if synset.pos not in ic:
        msg = 'Information content file has no entries for part-of-speech: %s'
        raise WordNetError(msg % synset.pos)
    pos_counts = ic[synset.pos]
    count = pos_counts[synset.offset]
    if count == 0:
        return _INF
    return -math.log(count / pos_counts[0])
the highest information content value. If two nodes have no
explicit common subsumer, assume that they share an artificial
root node that is the hypernym of all explicit roots.
:type synset1: Synset
:param synset1: First input synset.
:type synset2: Synset
:param synset2: Second input synset. Must be the same part of
speech as the first synset.
:type ic: dict
:param ic: an information content object (as returned by ``load_ic()``).
:return: The information content of the two synsets and their most
informative subsumer
"""
# NOTE(review): tail of a helper whose `def` line (and the `verbose`
# flag it reads) lies outside this chunk.
# IC values are only comparable within a single part of speech.
if synset1.pos != synset2.pos:
raise WordNetError('Computing the least common subsumer requires ' + \
'%s and %s to have the same part of speech.' % \
(synset1, synset2))
ic1 = information_content(synset1, ic)
ic2 = information_content(synset2, ic)
subsumers = synset1.common_hypernyms(synset2)
# No common hypernym: the artificial shared root carries IC 0.
if len(subsumers) == 0:
subsumer_ic = 0
else:
# The most informative subsumer is the one with maximal IC.
subsumer_ic = max(information_content(s, ic) for s in subsumers)
if verbose:
print("> LCS Subsumer by content:", subsumer_ic)
return ic1, ic2, subsumer_ic
# NOTE(review): fragment of the index-file loader -- the enclosing loop,
# the `try` matching the `except` below, and the bindings for
# _next_token, n_pointers, suffix, i, n_synsets, lemma and pos are all
# outside this chunk.
# Consume and discard the pointer symbols; only their count matters here.
_ = [_next_token() for _ in xrange(n_pointers)]
# same as number of synsets
n_senses = int(_next_token())
assert n_synsets == n_senses
# get number of senses ranked according to frequency
_ = int(_next_token())
# get synset offsets
synset_offsets = [int(_next_token()) for _ in xrange(n_synsets)]
# raise more informative error with file name and line number
except (AssertionError, ValueError) as e:
# i is presumably the 0-based line index, hence i + 1 -- TODO confirm.
tup = ('index.%s' % suffix), (i + 1), e
raise WordNetError('file %s, line %i: %s' % tup)
# map lemmas and parts of speech to synsets
self._lemma_pos_offset_map[lemma][pos] = synset_offsets
# Satellite adjectives share the plain-adjective offsets.
if pos == ADJ:
self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets
def lemma(self, name):
    """Return the Lemma object named e.g. ``'dog.n.01.dog'``.

    The sense-number regex (rather than a plain split) copes with lemma
    names that themselves contain dots,
    e.g. ``'.45_caliber.a.01..45_caliber'``.

    :raises WordNetError: if the synset has no lemma of that name.
    """
    # Everything through the two-digit sense number names the synset;
    # everything after the following dot names the lemma.
    sense_pos = SENSENUM_RE.search(name).start()
    synset_name = name[:sense_pos + 3]
    lemma_name = name[sense_pos + 4:]
    target = self.synset(synset_name)
    matches = [lem for lem in target.lemmas if lem.name == lemma_name]
    if matches:
        return matches[0]
    raise WordNetError('no lemma %r in %r' % (lemma_name, synset_name))