Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
s[i][j][0] = s[i][j][0].replace("/", "&slash;")
s[i][j] = "/".join(s[i][j])
s[i] = " ".join(s[i])
s = "\n".join(s)
s = TaggedString(unicode(s), format, language=kwargs.get("language", self.language))
return s
#--- TAGGED STRING ---------------------------------------------------------------------------------
# Pattern.parse() returns a TaggedString: a Unicode string with "tags" and "language" attributes.
# The pattern.text.tree.Text class uses the "tags" attribute to determine the token format and
# to transform the tagged string into a parse tree of nested Sentence, Chunk and Word objects.
# Constant passed as the argument in TaggedString.split(TOKENS).
# NOTE(review): split() is not visible in this excerpt -- presumably this value requests the
# nested sentences > tokens > tags list form; confirm against the split() implementation.
TOKENS = "tokens"
class TaggedString(unicode):

    def __new__(cls, string, tags=("word",), language=None):
        """ Unicode string with tags and language attributes.
            For example: TaggedString("cat/NN/NP", tags=["word", "pos", "chunk"]).

            string   -- one of:
                        * a plain string, used as-is;
                        * a Unicode string that has a "tags" attribute (e.g., another
                          TaggedString): its tags (and language, if present) take
                          precedence over the tags and language arguments;
                        * a list of sentences, each a list of tokens, each a list of
                          tag strings: joined with newlines, spaces and slashes.
            tags     -- names of the slash-separated fields in each token;
                        copied into a new list on the instance.
            language -- stored unchanged on the instance.
        """
        # From a TaggedString: inherit its annotations.
        if isinstance(string, unicode) and hasattr(string, "tags"):
            tags = string.tags
            # Only "tags" was checked above, so do not assume "language" exists:
            # fall back to the language argument instead of raising AttributeError.
            language = getattr(string, "language", language)
        # From a TaggedString.split(TOKENS) list:
        if isinstance(string, list):
            # A slash separates the tags of a token,
            # so a literal slash inside any field is encoded as &slash;
            string = "\n".join(
                " ".join(
                    "/".join(field.replace("/", "&slash;") for field in token)
                    for token in sentence)
                for sentence in string)
        s = unicode.__new__(cls, string)
        # Copy, so the instance never shares a list with the caller (or with the default).
        s.tags = list(tags)
        s.language = language
        return s
"""Returns an list of tuples of the form (word, POS tag).
Example:
::
[('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
('Thursday', 'NNP'), ('morning', 'NN')]
:rtype: list of tuples
"""
if isinstance(self, TextBlob):
return [val for sublist in [s.pos_tags for s in self.sentences] for val in sublist]
else:
return [(Word(word, pos_tag=t), unicode(t))
for word, t in self.pos_tagger.tag(self)
if not PUNCTUATION_REGEX.match(unicode(t))]
def pos_tags(self):
    """Returns a list of tuples of the form (word, POS tag).

    Example:
    ::

        [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
                ('Thursday', 'NNP'), ('morning', 'NN')]

    A TextBlob concatenates the ``pos_tags`` of its sentences, in order.
    Any other object runs its own ``pos_tagger`` on itself and drops every
    pair whose tag matches PUNCTUATION_REGEX.

    :rtype: list of tuples
    """
    if not isinstance(self, TextBlob):
        tagged = self.pos_tagger.tag(self)
        return [
            (Word(token, pos_tag=tag), unicode(tag))
            for token, tag in tagged
            if not PUNCTUATION_REGEX.match(unicode(tag))
        ]
    # Collect the per-sentence results first, then flatten them into one list.
    per_sentence = [sentence.pos_tags for sentence in self.sentences]
    flattened = []
    for sentence_tags in per_sentence:
        flattened.extend(sentence_tags)
    return flattened
def find_tokens(string, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, replace=replacements, linebreak=r"\n{2,}"):
""" Returns a list of sentences. Each sentence is a space-separated string of tokens (words).
Handles common cases of abbreviations (e.g., etc., ...).
Punctuation marks are split from other words. Periods (or ?!) mark the end of a sentence.
Headings without an ending period are inferred by line breaks.
"""
# Handle periods separately.
punctuation = tuple(punctuation.replace(".", ""))
# Handle replacements (contractions).
for a, b in list(replace.items()):
string = re.sub(a, b, string)
# Handle Unicode quotes.
if isinstance(string, unicode):
string = unicode(string).replace("“", " “ ")\
.replace("”", " ” ")\
.replace("‘", " ‘ ")\
.replace("’", " ’ ")\
.replace("'", " ' ")\
.replace('"', ' " ')
# Collapse whitespace.
string = re.sub("\r\n", "\n", string)
string = re.sub(linebreak, " %s " % EOS, string)
string = re.sub(r"\s+", " ", string)
tokens = []
for t in TOKEN.findall(string+" "):
if len(t) > 0:
tail = []
while t.startswith(punctuation) and \
not t in replace:
# Split leading punctuation.
if chunks:
format.extend(("chunk", "preposition"))
if relations:
format.append("relation")
if lemmata:
format.append("lemma")
# Collapse raw list.
# Sentences are separated by newlines, tokens by spaces, tags by slashes.
# Slashes in words are encoded with &slash;
for i in range(len(s)):
for j in range(len(s[i])):
s[i][j][0] = s[i][j][0].replace("/", "&slash;")
s[i][j] = "/".join(s[i][j])
s[i] = " ".join(s[i])
s = "\n".join(s)
s = TaggedString(unicode(s), format, language=kwargs.get("language", self.language))
return s
def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
        The input is converted with unicode() and handed to the module-level parser,
        together with any extra positional and keyword arguments.
    """
    text = unicode(s)
    return parser.parse(text, *args, **kwargs)
def decode_string(v, encoding="utf-8"):
    """ Returns the given value as a Unicode string (if possible).

        v        -- the value to convert. Byte strings are decoded;
                    anything else is passed through unicode().
        encoding -- either a codec name, or a sequence of argument tuples for
                    bytes.decode(), e.g., (("utf-8",), ("latin-1", "replace")).
                    A codec name is tried first, followed by windows-1252 and
                    finally utf-8 with undecodable bytes ignored.

        If none of the candidates can decode a byte string,
        the byte string is returned unchanged.
    """
    if isinstance(encoding, basestring):
        encoding = ((encoding,),) + (("windows-1252",), ("utf-8", "ignore"))
    if isinstance(v, binary_type):
        for e in encoding:
            try:
                return v.decode(*e)
            except (UnicodeError, LookupError, TypeError):
                # Best effort: undecodable bytes (UnicodeError), an unknown codec or
                # error handler (LookupError) or a malformed argument tuple (TypeError)
                # means "try the next candidate". Unlike a bare except, this lets
                # KeyboardInterrupt and SystemExit propagate.
                continue
        # Every candidate failed.
        return v
    return unicode(v)
def polarity(s, **kwargs):
    """ Returns the sentence polarity (positive/negative) between -1.0 and 1.0.
        This is the first item of the result of sentiment() for the given string.
    """
    scores = sentiment(unicode(s), **kwargs)
    return scores[0]
def parsetree(s, *args, **kwargs):
    """ Returns a parsed Text from the given string.
        The string is tagged with parse() (extra arguments are passed along)
        and the tagged result is wrapped in a Text.
    """
    tagged = parse(unicode(s), *args, **kwargs)
    return Text(tagged)