import re

import langdetect


def is_valid_line(line):
    # Drop lines that look like contact/boilerplate text.
    if 'contact' in line.lower():
        return False
    # Drop long runs of words with no punctuation (unlikely to be prose).
    if not re.search(r'\.|\!|\,', line) and len(line.split()) > 15:
        return False
    # Drop lines containing prices such as "$ 100".
    if re.search(r'\$\s*\d+', line):
        return False
    try:
        if langdetect.detect(line) != 'en':
            return False
    except langdetect.lang_detect_exception.LangDetectException:
        # Not enough signal to detect a language; keep the line.
        return True
    return True
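# A minimal sketch of how this filter might be applied, assuming the input is
# a plain-text dump split into lines; the sample text below is illustrative
# only and not part of the original snippet.
raw_text = (
    "Contact us at sales@example.com\n"
    "This product costs $ 250 today.\n"
    "Language detection works well on ordinary English sentences like this one."
)
kept = [line for line in raw_text.splitlines() if is_valid_line(line)]
print(kept)  # only the plain English sentence should survive the filters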
script.extract()  # rip the JavaScript out of the parsed document
try:
    self.set_links(soup)
except ValueError:
    raise WebpageError('Cannot set links')
try:
    self.title = str(soup.title.string)
    self.text = str(soup.body.text)
except AttributeError:
    raise WebpageError('Cannot get title or text')
try:
    self.language = langdetect.detect(self.text)
except langdetect.lang_detect_exception.LangDetectException:
    raise WebpageError('Cannot detect language.')
self.title_words = self.text_to_words(self.title)
# Convert all whitespace runs to a single space.
self.text = ' '.join(
    filter(lambda x: not x == '', re.split(r'\s', self.text)))
# This version does not handle multibyte characters.
self.text = self.remove_non_ascii_character(self.text)
self.summary = self.text[:500]
self.words = self.text_to_words(self.text)
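# The fragment above calls two helpers that are not shown here. A minimal
# sketch of what they might look like, assuming plain whitespace/word
# tokenization; both bodies are illustrative reconstructions, not the
# original implementations.
import re


def remove_non_ascii_character(text):
    # Keep only ASCII characters; multibyte characters are dropped.
    return re.sub(r'[^\x00-\x7F]', '', text)


def text_to_words(text):
    # Lower-case and split on non-word characters, discarding empty tokens.
    return [w for w in re.split(r'\W+', text.lower()) if w]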
try:
    self.title = str(soup.title.string)
    self.text = str(soup.body.text)
except AttributeError:
    raise WebpageError('Cannot get title or text')
try:
    print('webpage.py start detecting language ' + url,
          file=sys.stderr)
    self.language = langdetect.detect(self.text)
    print('webpage.py finish detecting language ' + url,
          file=sys.stderr)
    if not self.language == language:
        raise WebpageError("Language doesn't match.")
except langdetect.lang_detect_exception.LangDetectException:
    raise WebpageError('Cannot detect language.')
print('webpage.py start text_to_words for title ' + url,
      file=sys.stderr)
self.title_words = self.text_to_words(
    self.title, language=self.language)
print('webpage.py finish text_to_words for title ' + url,
      file=sys.stderr)
# Convert all whitespace runs to a single space.
# self.text = ' '.join(
#     filter(lambda x: not x == '', re.split(r'\s', self.text)))
# This version does not handle multibyte characters.
self.summary = self.text[:500]
print('webpage.py start text_to_words for text ' + url,
      file=sys.stderr)
                    media_id,
                    media_code
                ))
                action_scheduler.add_like(media_id, media_code)
            # end if
        except ActionReservoirFullError:
            logging.getLogger(pystr.LOGGER).error(pystr.ERROR_RESERVOIR_FULL)
            exit()
            pass
        except ActionAlreadyExists:
            logging.getLogger(pystr.LOGGER).error(pystr.ERROR_COMMENT_ALREADY_DB.format(
                media_id))
            pass
        # end try
    # end if
except langdetect.lang_detect_exception.LangDetectException:
    pass
# end try
def validate_language(self, text: str) -> bool:
    """
    Validate whether the language of the text is one of the valid languages.
    """
    if not self.valid_languages:
        return True
    try:
        lang = langdetect.detect(text)
    except langdetect.lang_detect_exception.LangDetectException:
        lang = None
    return lang in self.valid_languages
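# For reference, a short self-contained check of the failure mode this method
# guards against: langdetect raises LangDetectException when the input has no
# usable language features (for example, digits or punctuation only).
import langdetect
from langdetect.lang_detect_exception import LangDetectException

for sample in ['This is an ordinary English sentence.', '12345 !!!']:
    try:
        print(sample, '->', langdetect.detect(sample))
    except LangDetectException:
        print(sample, '-> detection failed')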
# There's no point processing empty comments or comments too short
# to possibly contain an insult.
if comment_len < config["min_comment_length"]:
    return False
# Long-form comments are far more difficult to process with current
# NLP techniques. Most work is on 1-2 sentence examples. A decent paragraph
# is 6-10 sentences and around 600-1000 characters.
# We want to avoid having essays as part of our dataset.
valid_length = comment_len <= config["max_comment_length"]
# Ignore comments that aren't in a language our model will handle. This
# will very likely just be English ('en').
try:
    valid_language = detect(comment) in config["allowed_languages"]
except langdetect.lang_detect_exception.LangDetectException:
    logging.error("Comment: '{}' caused error in lang detect".format(comment.encode('utf-8')))
    return False
return valid_length and valid_language
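# A minimal sketch of the surrounding function and config this fragment seems
# to assume; the key names follow the fragment, but the function name and the
# concrete values are illustrative assumptions.
import logging
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

config = {
    "min_comment_length": 10,
    "max_comment_length": 1000,
    "allowed_languages": ["en"],
}


def is_valid_comment(comment):
    comment_len = len(comment)
    if comment_len < config["min_comment_length"]:
        return False
    valid_length = comment_len <= config["max_comment_length"]
    try:
        valid_language = detect(comment) in config["allowed_languages"]
    except LangDetectException:
        logging.error("Comment caused error in lang detect")
        return False
    return valid_length and valid_language


print(is_valid_comment("This comment is long enough and written in English."))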
def get_lang(text):
    resu = None
    try:
        langs = langdetect.detect_langs(text)
        for language in langs:
            # Bias the result towards Russian by boosting its probability.
            if language.lang == "ru":
                language.prob += 0.2
            # Keep the highest-ranked candidate seen so far.
            if resu is None or resu < language:
                resu = language
    except langdetect.lang_detect_exception.LangDetectException:
        pass
    if resu is None:
        # Fall back to Russian when detection fails entirely.
        return "ru"
    return resu.lang
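# Hypothetical calls illustrating the Russian bias above: Russian text and
# undetectable input both resolve to "ru", while clearly English text should
# still win despite the boost. Exact results depend on langdetect's
# probabilistic model.
print(get_lang("Это предложение написано по-русски."))   # expected "ru"
print(get_lang("This sentence is written in English."))   # expected "en"
print(get_lang("12345"))                                  # falls back to "ru"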
def process_normalized_lang_map(text):
    try:
        lang_map = {l.lang: l.prob
                    for l in langdetect.detect_langs(text or "")}
    except langdetect.lang_detect_exception.LangDetectException:
        lang_map = {}
    normalized_lang_map = defaultdict(lambda: 0.0)
    for lang in ALL_LANGS:
        # Map variant codes to a common code and accumulate their probabilities.
        norm_lang = COMMON_LANGUAGE_MAP.get(lang, lang)
        normalized_lang_map[norm_lang] += lang_map.get(lang, 0.0)
    return normalized_lang_map
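# The constants used above are not shown in the fragment. A minimal
# illustrative definition, assuming ALL_LANGS is a subset of langdetect's
# language codes and COMMON_LANGUAGE_MAP collapses the Chinese variants; both
# definitions are assumptions, not the original values.
from collections import defaultdict

import langdetect

ALL_LANGS = ['en', 'ru', 'de', 'zh-cn', 'zh-tw']
COMMON_LANGUAGE_MAP = {'zh-cn': 'zh', 'zh-tw': 'zh'}

print(dict(process_normalized_lang_map("This text is in English.")))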
def POSTag(text, sent_split=True, tolist=True):
    words = []
    if text != '':
        try:
            lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            lang = "undetermined"
        if lang == "zh-cn":  # If the text is Chinese, segment it; otherwise leave it as is.
            #########
            if sent_split:
                annotators = ['tokenize', 'ssplit', 'pos']
                with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
                    ann = client.annotate(text)
                words = [[(token.word, token.pos) for token in sent.token] for sent in ann.sentence]
                segmented_list = [' '.join(['#'.join(posted) for posted in wordlist]) for wordlist in words]
                segmented = '\n'.join(segmented_list)
            else:
                annotators = ['tokenize', 'pos']
                with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
                    ann = client.annotate(text)
                words = [(token.word, token.pos) for token in ann.sentencelessToken]
                segmented = ' '.join(['#'.join(posted) for posted in words])
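# langdetect's detection is probabilistic and can vary between runs on short
# or mixed-language input. If the "zh-cn" branch above must trigger
# consistently, the library documents seeding the detector factory once,
# before the first detection:
from langdetect import DetectorFactory

DetectorFactory.seed = 0  # makes langdetect.detect() deterministic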
def language_in_tweet(tweet):
    detected_lang = None
    try:
        detected_lang = detect(tweet['text'])
    except lang_detect_exception.LangDetectException:
        pass
    return detected_lang in args
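# Hypothetical usage, assuming `args` is a module-level collection of allowed
# language codes (for example, parsed from the command line) and tweets are
# dicts with a 'text' field; both assumptions are illustrative.
from langdetect import detect, lang_detect_exception

args = ['en', 'es']
tweet = {'text': 'Just watched a great film tonight, highly recommended!'}
print(language_in_tweet(tweet))  # True if detection yields 'en' or 'es'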
cp = copy.deepcopy(self)