# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _xtokenize_from_raw(self, token, addlocs, addcontexts):
    """
    XTokenize the given token by using C{self.raw_xtokenize} to
    tokenize its text string.  Locations are reconstructed by
    searching for each consecutive token in the text string.  To
    ensure that locations are correctly assigned,
    C{self.raw_xtokenize} must make the following guarantee:

        - For each subtoken M{t[i]}, the text separating M{t[i-1]}
          and M{t[i]} does not contain M{t[i]} as a substring.

    This method is intended to be used by subclasses that wish to
    implement the C{xtokenize} method based on C{raw_xtokenize}.
    """
    assert chktype(1, token, Token)
    SUBTOKENS = self._property_names.get('SUBTOKENS', 'SUBTOKENS')
    # Bind the lazy subtoken iterator under a descriptive name rather
    # than "iter", which shadows the builtin iter() function.
    subtok_iter = self._xtokenize_from_raw_helper(token, addlocs, addcontexts)
    token[SUBTOKENS] = subtok_iter
- C{log_likelihood_cutoff}: specifies what log-likelihood
value should be taken to indicate convergence. If the
log-likelihod becomes closer to zero than the specified
value, then IIS will terminate. The default value is
C{None}, which indicates that no log-likelihood cutoff
should be used. (type=C{float})
- C{delta_log_likelihood_cutoff}: specifies what change in
log-likelihood should be taken to indicate convergence.
If the log-likelihood changes by less than this value in a
single iteration, then IIS will terminate. The default
value is C{None}, which indicates that no
log-likelihood-change cutoff should be used. (type=C{float})
"""
assert _chktype(1, train_toks, [Token], (Token,))
# Process the keyword arguments.
iter = 20
debug = 0
classes = None
ll_cutoff = lldelta_cutoff = None
acc_cutoff = accdelta_cutoff = None
for (key, val) in kwargs.items():
if key in ('iterations', 'iter'): iter = val
elif key == 'debug': debug = val
elif key == 'classes': classes = val
elif key == 'log_likelihood_cutoff':
ll_cutoff = abs(val)
elif key == 'delta_log_likelihood_cutoff':
lldelta_cutoff = abs(val)
elif key == 'accuracy_cutoff':
acc_cutoff = abs(val)
def precision(reference, test):
    """
    Compute what fraction of the values in C{test} also appear in
    C{reference}; in particular, return
    |C{reference}S{cap}C{test}|/|C{test}|.

    @type reference: C{Set}
    @param reference: A set of reference values.
    @type test: C{Set}
    @param test: A set of values to compare against the reference set.
    @rtype: C{float} or C{None}
    @return: The precision score, or C{None} when C{test} is empty.
    """
    assert chktype(1, reference, sets.BaseSet)
    assert chktype(2, test, sets.BaseSet)
    # Precision is undefined for an empty test set.
    if len(test) == 0:
        return None
    overlap = reference.intersection(test)
    return float(len(overlap)) / len(test)
def accuracy(reference, test):
"""
Given a list of reference values and a corresponding list of test
values, return the percentage of corresponding values that are
equal. In particular, return the percentage of indices
C{0
def __init__(self, function, name=None):
    """
    Construct a new C{FunctionFeatureDetector} from the given
    function.

    @param function: The function that this feature detector is based
        on.  When this feature detector is applied to a labeled
        text M{lt}, it will return M{C{func}(lt)}.
    @type function: C{LabeledText} -> (any)
    @param name: A name for the function used by this feature
        detector.  This name is used in the string representation
        of the feature detector.
    """
    assert _chktype(1, function, types.FunctionType,
                    types.BuiltinFunctionType, types.ClassType)
    assert _chktype(2, name, types.NoneType, types.StringType)
    # Record the wrapped callable and the (possibly None) display name.
    self._func = function
    self._name = name
"""
@rtype: chunk structure
@return: a chunk structure that encodes the chunks in a given
tagged sentence. A chunk is a non-overlapping linguistic
group, such as a noun phrase. The set of chunks
identified in the chunk structure depends on the rules
used to define this C{RegexpChunkParser}.
@type trace: C{int}
@param trace: The level of tracing that should be used when
parsing a text. C{0} will generate no tracing output;
C{1} will generate normal tracing output; and C{2} or
highter will generate verbose tracing output. This value
overrides the trace level value that was given to the
constructor.
"""
assert chktype(1, token, Token)
assert chktype(2, trace, types.NoneType, types.IntType)
SUBTOKENS = self.property('SUBTOKENS')
TREE = self.property('TREE')
if len(token[SUBTOKENS]) == 0:
print 'Warning: parsing empty text'
token[TREE] = Tree(self._top_node, [])
return
# Use the default trace value?
if trace == None: trace = self._trace
# Create the chunkstring, using the same properties as the parser
chunkstr = ChunkString(token[SUBTOKENS], **self.property_names())
# Apply the sequence of rules to the chunkstring.
a condition's frequency distribution to its probability
distribution. The function is called with the frequency
distribution as its first argument, the condition as its
second argument (only if C{supply_condition=True}), and
C{factory_args} as its remaining arguments.
@type supply_condition: C{bool}
@param supply_condition: If true, then pass the condition as
the second argument to C{probdist_factory}.
@type factory_args: (any)
@param factory_args: Extra arguments for C{probdist_factory}.
These arguments are usually used to specify extra
properties for the probability distributions of individual
conditions, such as the number of bins they contain.
"""
assert _chktype(1, cfdist, ConditionalFreqDist)
assert _chktype(2, probdist_factory, types.FunctionType,
types.BuiltinFunctionType, types.MethodType,
types.ClassType)
assert _chktype(3, supply_condition, bool)
self._probdist_factory = probdist_factory
self._cfdist = cfdist
self._supply_condition = supply_condition
self._factory_args = factory_args
self._pdists = {}
for c in cfdist.conditions():
if supply_condition:
pdist = probdist_factory(cfdist[c], c, *factory_args)
else:
pdist = probdist_factory(cfdist[c], *factory_args)
self._pdists[c] = pdist
def Nr(self, r, bins=None):
    """
    @return: The number of samples with count r.
    @rtype: C{int}
    @type r: C{int}
    @param r: A sample count.
    @type bins: C{int}
    @param bins: The number of possible sample outcomes.  C{bins}
        is used to calculate Nr(0).  In particular, Nr(0) is
        C{bins-self.B()}.  If C{bins} is not specified, it
        defaults to C{self.B()} (so Nr(0) will be 0).
    @raise IndexError: If C{r} is negative.
    """
    assert _chktype(1, r, types.IntType)
    assert _chktype(2, bins, types.IntType, types.NoneType)
    # Use the call form of raise, which is valid in both Python 2 and
    # Python 3 (the old "raise Err, msg" form is Python-2-only).
    if r < 0:
        raise IndexError('FreqDist.Nr(): r must be non-negative')
    # Special case for Nr(0): samples never seen.
    if r == 0:
        if bins is None: return 0
        else: return bins-self.B()
    # We have to search the entire distribution to find Nr.  Since
    # this is an expensive operation, and is likely to be used
    # repeatedly, cache the results.
    if self._Nr_cache is None:
        self._cache_Nr_values()
    if r >= len(self._Nr_cache): return 0
    return self._Nr_cache[r]
like parenthases. E.g., so that in '+', the '+' has scope
over the entire ''; and so that in '', the '|' has
scope over 'NN' and 'IN', but not '<' or '>'.
- Check to make sure the resulting pattern is valid.
@type tag_pattern: C{string}
@param tag_pattern: The tag pattern to convert to a regular
expression pattern.
@raise ValueError: If C{tag_pattern} is not a valid tag pattern.
In particular, C{tag_pattern} should not include braces; and it
should not contain nested or mismatched angle-brackets.
@rtype: C{string}
@return: A regular expression pattern corresponding to
C{tag_pattern}.
"""
assert chktype(1, tag_pattern, types.StringType)
# Clean up the regular expression
tag_pattern = re.sub(r'\s', '', tag_pattern)
tag_pattern = re.sub(r'<', '(<(', tag_pattern)
tag_pattern = re.sub(r'>', ')>)', tag_pattern)
# Check the regular expression
if not _VALID_TAG_PATTERN.match(tag_pattern):
raise ValueError('Bad tag pattern: %s' % tag_pattern)
# Replace "." with _TAGCHAR.
# We have to do this after, since it adds {}[]<>s, which would
# confuse _VALID_TAG_PATTERN.
# PRE doesn't have lookback assertions, so reverse twice, and do
# the pattern backwards (with lookahead assertions). This can be
# made much cleaner once we can switch back to SRE.
def classify(self, token):
    """
    Assign the given token to a cluster: read its 'FEATURES' vector,
    optionally normalise and project it, classify it in vector space,
    and store the resulting cluster name under the token's 'CLUSTER'
    property.
    """
    assert chktype(1, token, Token)
    feats = token['FEATURES']
    #assert chktype('features', vector, numarray.array([]), SparseArray)
    if self._should_normalise:
        feats = self._normalise(feats)
    # Project onto the reduced subspace when a transform matrix is set.
    # (Keep "!= None" as-is: numarray arrays may overload comparison.)
    if self._Tt != None:
        feats = numarray.matrixmultiply(self._Tt, feats)
    cluster_idx = self.classify_vectorspace(feats)
    token['CLUSTER'] = self.cluster_name(cluster_idx)