Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _xtokenize_from_raw(self, token, addlocs, addcontexts):
"""
XTokenize the given token by using C{self.raw_xtokenize} to
tokenize its text string. Locations are reconstructed by
searching for each consecutive token in the text string. To
ensure that locations are correctly assigned,
C{self.raw_xtokenize} must make the following guarantee:
- For each subtoken M{t[i]}, the text separating M{t[i-1]}
and M{t[i]} does not contain M{t[i]} as a substring.
This method is intended to be used by subclasses that wish to
implement the C{xtokenize} method based on C{raw_xtokenize}.
"""
assert chktype(1, token, Token)
SUBTOKENS = self._property_names.get('SUBTOKENS', 'SUBTOKENS')
iter = self._xtokenize_from_raw_helper(token, addlocs, addcontexts)
token[SUBTOKENS] = iter- C{log_likelihood_cutoff}: specifies what log-likelihood
value should be taken to indicate convergence. If the
log-likelihood becomes closer to zero than the specified
value, then IIS will terminate. The default value is
C{None}, which indicates that no log-likelihood cutoff
should be used. (type=C{float})
- C{delta_log_likelihood_cutoff}: specifies what change in
log-likelihood should be taken to indicate convergence.
If the log-likelihood changes by less than this value in a
single iteration, then IIS will terminate. The default
value is C{None}, which indicates that no
log-likelihood-change cutoff should be used. (type=C{float})
"""
assert _chktype(1, train_toks, [Token], (Token,))
# Process the keyword arguments.
iter = 20
debug = 0
classes = None
ll_cutoff = lldelta_cutoff = None
acc_cutoff = accdelta_cutoff = None
for (key, val) in kwargs.items():
if key in ('iterations', 'iter'): iter = val
elif key == 'debug': debug = val
elif key == 'classes': classes = val
elif key == 'log_likelihood_cutoff':
ll_cutoff = abs(val)
elif key == 'delta_log_likelihood_cutoff':
lldelta_cutoff = abs(val)
elif key == 'accuracy_cutoff':
acc_cutoff = abs(val)def precision(reference, test):
"""
Given a set of reference values and a set of test values, return
the percentage of test values that appear in the reference set.
In particular, return |C{reference}S{cap}C{test}|/|C{test}|.
If C{test} is empty, then return C{None}.
@type reference: C{Set}
@param reference: A set of reference values.
@type test: C{Set}
@param test: A set of values to compare against the reference set.
@rtype: C{float} or C{None}
"""
assert chktype(1, reference, sets.BaseSet)
assert chktype(2, test, sets.BaseSet)
if len(test) == 0:
return None
else:
return float(len(reference.intersection(test)))/len(test)def accuracy(reference, test):
"""
Given a list of reference values and a corresponding list of test
values, return the percentage of corresponding values that are
equal. In particular, return the percentage of indices
C{0def __init__(self, function, name=None):
"""
Construct a new C{FunctionFeatureDetector} from the given
function.
@param function: The function that this feature detector is based
on. When this feature detector is applied to a labeled
text M{lt}, it will return M{C{func}(lt)}.
@type function: C{LabeledText} -> (any)
@param name: A name for the function used by this feature
detector. This name is used in the string representation
of the feature detector.
"""
assert _chktype(1, function, types.FunctionType,
types.BuiltinFunctionType, types.ClassType)
assert _chktype(2, name, types.NoneType, types.StringType)
self._name = name
self._func = function"""
@rtype: chunk structure
@return: a chunk structure that encodes the chunks in a given
tagged sentence. A chunk is a non-overlapping linguistic
group, such as a noun phrase. The set of chunks
identified in the chunk structure depends on the rules
used to define this C{RegexpChunkParser}.
@type trace: C{int}
@param trace: The level of tracing that should be used when
parsing a text. C{0} will generate no tracing output;
C{1} will generate normal tracing output; and C{2} or
higher will generate verbose tracing output. This value
overrides the trace level value that was given to the
constructor.
"""
assert chktype(1, token, Token)
assert chktype(2, trace, types.NoneType, types.IntType)
SUBTOKENS = self.property('SUBTOKENS')
TREE = self.property('TREE')
if len(token[SUBTOKENS]) == 0:
print 'Warning: parsing empty text'
token[TREE] = Tree(self._top_node, [])
return
# Use the default trace value?
if trace == None: trace = self._trace
# Create the chunkstring, using the same properties as the parser
chunkstr = ChunkString(token[SUBTOKENS], **self.property_names())
# Apply the sequence of rules to the chunkstring.a condition's frequency distribution to its probability
distribution. The function is called with the frequency
distribution as its first argument, the condition as its
second argument (only if C{supply_condition=True}), and
C{factory_args} as its remaining arguments.
@type supply_condition: C{bool}
@param supply_condition: If true, then pass the condition as
the second argument to C{probdist_factory}.
@type factory_args: (any)
@param factory_args: Extra arguments for C{probdist_factory}.
These arguments are usually used to specify extra
properties for the probability distributions of individual
conditions, such as the number of bins they contain.
"""
assert _chktype(1, cfdist, ConditionalFreqDist)
assert _chktype(2, probdist_factory, types.FunctionType,
types.BuiltinFunctionType, types.MethodType,
types.ClassType)
assert _chktype(3, supply_condition, bool)
self._probdist_factory = probdist_factory
self._cfdist = cfdist
self._supply_condition = supply_condition
self._factory_args = factory_args
self._pdists = {}
for c in cfdist.conditions():
if supply_condition:
pdist = probdist_factory(cfdist[c], c, *factory_args)
else:
pdist = probdist_factory(cfdist[c], *factory_args)
self._pdists[c] = pdistdef Nr(self, r, bins=None):
"""
@return: The number of samples with count r.
@rtype: C{int}
@type r: C{int}
@param r: A sample count.
@type bins: C{int}
@param bins: The number of possible sample outcomes. C{bins}
is used to calculate Nr(0). In particular, Nr(0) is
C{bins-self.B()}. If C{bins} is not specified, it
defaults to C{self.B()} (so Nr(0) will be 0).
"""
assert _chktype(1, r, types.IntType)
assert _chktype(2, bins, types.IntType, types.NoneType)
if r < 0: raise IndexError, 'FreqDist.Nr(): r must be non-negative'
# Special case for Nr(0):
if r == 0:
if bins is None: return 0
else: return bins-self.B()
# We have to search the entire distribution to find Nr. Since
# this is an expensive operation, and is likely to be used
# repeatedly, cache the results.
if self._Nr_cache is None:
self._cache_Nr_values()
if r >= len(self._Nr_cache): return 0
return self._Nr_cache[r]like parenthases. E.g., so that in '+', the '+' has scope
over the entire '<NN>'; and so that in '<NN|IN>', the '|' has
scope over 'NN' and 'IN', but not '<' or '>'.
- Check to make sure the resulting pattern is valid.
@type tag_pattern: C{string}
@param tag_pattern: The tag pattern to convert to a regular
expression pattern.
@raise ValueError: If C{tag_pattern} is not a valid tag pattern.
In particular, C{tag_pattern} should not include braces; and it
should not contain nested or mismatched angle-brackets.
@rtype: C{string}
@return: A regular expression pattern corresponding to
C{tag_pattern}.
"""
assert chktype(1, tag_pattern, types.StringType)
# Clean up the regular expression
tag_pattern = re.sub(r'\s', '', tag_pattern)
tag_pattern = re.sub(r'<', '(<(', tag_pattern)
tag_pattern = re.sub(r'>', ')>)', tag_pattern)
# Check the regular expression
if not _VALID_TAG_PATTERN.match(tag_pattern):
raise ValueError('Bad tag pattern: %s' % tag_pattern)
# Replace "." with _TAGCHAR.
# We have to do this after, since it adds {}[]<>s, which would
# confuse _VALID_TAG_PATTERN.
# PRE doesn't have lookback assertions, so reverse twice, and do
# the pattern backwards (with lookahead assertions). This can be
# made much cleaner once we can switch back to SRE.

def classify(self, token):
    """
    Assign C{token} to a cluster, based on its feature vector.

    The vector stored under C{token['FEATURES']} is normalised when
    C{self._should_normalise} is true, multiplied by C{self._Tt}
    when that attribute is set, and then handed to
    C{self.classify_vectorspace}.  The name of the resulting cluster
    (as given by C{self.cluster_name}) is written to
    C{token['CLUSTER']}; nothing is returned.

    @param token: The token to classify; checked to be a C{Token}.
        It must carry a C{'FEATURES'} entry.
    """
    assert chktype(1, token, Token)
    # NOTE(review): a disabled type check here named numarray arrays
    # and SparseArray as the expected vector types -- confirm.
    vector = token['FEATURES']
    if self._should_normalise:
        vector = self._normalise(vector)
    # Identity test on purpose: "!= None" dispatches to the object's
    # own comparison, which for an array-like value need not produce
    # a plain boolean.
    if self._Tt is not None:
        vector = numarray.matrixmultiply(self._Tt, vector)
    cluster = self.classify_vectorspace(vector)
    token['CLUSTER'] = self.cluster_name(cluster)