dois = [v.replace("http://dx.doi.org/", "") for v in source["identifiers"] if "doi.org" in v]
if len(dois) == 0:
    self.logger.warn("document without DOI")
elif len(dois) == 1:
    doc.update({"doi": dois[0]})
else:
    # As of 08/2019, documents with multiple DOIs all seem to resolve; take the first.
    self.logger.warn("document with multiple dois: %s", dois)
    doc.update({"doi": dois[0]})
if doc.get("language"):
    doc.update({"language": doc.get("language")})
else:
    if len(doc["abstract"]) > 20:
        result = langdetect.detect(doc["abstract"])
        doc["languages"] = [languages.get(alpha2=result).bibliographic]
        self.logger.debug("detected %s in abstract (%s)", doc["languages"], doc["abstract"][:40])
# Gather subjects.
subjects = source.get("subjects", []) + source.get("subject_synonyms", []) + source.get("tags", [])
unique_subjects = set(itertools.chain(*[v.split("|") for v in subjects]))
doc.update({"x.subjects": list(unique_subjects)})
# Try date_published, then date_created, then fail.
for key in ("date_published", "date_created"):
    if key not in source or not source[key]:
        continue
    doc.update({
        "x.date": source[key][:19] + "Z",
        "rft.date": source[key][:10],
    })
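
# The block above maps langdetect's ISO 639-1 result to a bibliographic code.
# A minimal standalone sketch of that step, assuming the `languages` helper
# used above comes from the iso-639 package (the fragment does not show its imports):
import langdetect
from iso639 import languages  # assumption: source of `languages` used above

abstract = "Wir untersuchen die Verteilung von Primzahlen in kurzen Intervallen."
if len(abstract) > 20:
    alpha2 = langdetect.detect(abstract)                        # e.g. 'de' (ISO 639-1)
    bibliographic = languages.get(alpha2=alpha2).bibliographic  # e.g. 'ger'
    print(alpha2, bibliographic)
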
for script in soup(["script", "style"]):
    script.extract()  # strip script and style elements
try:
    self.set_links(soup)
except ValueError:
    raise WebpageError('Cannot set links')
try:
    self.title = str(soup.title.string)
    self.text = str(soup.body.text)
except AttributeError:
    raise WebpageError('Cannot get title or text')
try:
    self.language = langdetect.detect(self.text)
    if self.language != language:
        raise WebpageError("Language doesn't match.")
except langdetect.lang_detect_exception.LangDetectException:
    raise WebpageError('Cannot detect language.')
self.title_words = self.text_to_words(self.title, language=self.language)
# Collapse all whitespace runs into single spaces.
self.text = ' '.join(
    filter(lambda x: x != '', re.split(r'\s', self.text)))
# Note: this simple slice does not account for multibyte characters.
self.summary = self.text[:500]
self.words = self.text_to_words(self.text, language=self.language)
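
# langdetect seeds its detector randomly, so short or ambiguous input (such as
# a sparse page body) can yield different results across runs. Seeding the
# DetectorFactory, as the langdetect README recommends, makes the comparison
# against `language` above reproducible. Illustrative sketch:
import langdetect
from langdetect import DetectorFactory

DetectorFactory.seed = 0
print(langdetect.detect("Bonjour tout le monde"))  # 'fr', stable across runs
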
def sanitize_text(self, text):
    try:
        allow_in_dict = detect(text) == 'en'
    except LangDetectException:
        allow_in_dict = False
    # Strip mentions, URLs and other non-word characters.
    sanitized_text = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)", " ", text).split())
    self.stop_words = set(stopwords.words('english'))
    self.stop_words.update(STOPWORDS)
    self.stop_words.update(self.ignored_words)
    word_tokens = word_tokenize(sanitized_text)
    # filtered_sentence = [w for w in word_tokens if not w in stop_words and len(w) > 1]
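
# The commented-out line above is the usual final filtering step. A standalone
# sketch of it, assuming the NLTK 'punkt' and 'stopwords' data are already
# downloaded (STOPWORDS and ignored_words from the snippet are omitted here):
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize("this is just a short example sentence")
filtered_sentence = [w for w in word_tokens if w not in stop_words and len(w) > 1]
print(filtered_sentence)  # ['short', 'example', 'sentence']
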
# There's no point processing empty comments or comments too short
# to possibly contain an insult.
if comment_len < config["min_comment_length"]:
    return False
# Long-form comments are far more difficult to process with current NLP
# techniques; most work targets 1-2 sentence examples, while a decent
# paragraph runs 6-10 sentences and around 600-1000 characters.
# We want to avoid having essays as part of our dataset.
valid_length = comment_len <= config["max_comment_length"]
# Ignore comments that aren't in a language our model will handle. This
# will very likely just be English ('en').
try:
    valid_language = detect(comment) in config["allowed_languages"]
except langdetect.lang_detect_exception.LangDetectException:
    logging.error("Comment: '{}' caused error in lang detect".format(comment.encode('utf-8')))
    return False
return valid_length and valid_language
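
# If a hard `detect(comment) in allowed_languages` check proves too coarse,
# langdetect also offers detect_langs(), which returns candidate languages
# with probabilities. A hedged sketch of a threshold-based variant; the
# is_probably_english helper and the 0.9 cut-off are illustrations, not part
# of the project above:
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException

def is_probably_english(comment, threshold=0.9):
    try:
        return any(candidate.lang == 'en' and candidate.prob >= threshold
                   for candidate in detect_langs(comment))
    except LangDetectException:
        return False
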
        for reg in regexes:
            if reg.search(line):
                os.remove(directory_path + filename)
                file_removed = True
                files_matching_regexes.append(filename)
                break
        if file_removed:
            break
    if file_removed:
        precedent_file.close()
        continue
    # Remove precedents written in English.
    precedent_file.seek(0)
    file_content = precedent_file.read()
    if detect(file_content) == 'en':
        os.remove(directory_path + filename)
        files_in_english.append(filename)
    precedent_file.close()
print('')
Log.write('Done filtering precedents')
Log.write('Removed {} files in English'.format(str(len(files_in_english))))
Log.write('Removed {} files without value'.format(str(len(files_matching_regexes))))
return files_in_english, files_matching_regexes
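
# The fragment above omits its enclosing loops: one over the files in
# directory_path and one over each file's lines. A hedged sketch of that outer
# structure, reconstructed for illustration rather than taken from the project:
import os

directory_path = 'precedents/'  # hypothetical location of the precedent files
for filename in os.listdir(directory_path):
    file_removed = False
    precedent_file = open(os.path.join(directory_path, filename), encoding='utf-8')
    for line in precedent_file:
        pass  # per-line regex checks as in the fragment above
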
def language(text, override_language=None):
    if override_language:
        return override_language
    try:
        return detect(text)
    except LangDetectException:
        return 'en'
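
# Example calls for the helper above (hypothetical inputs). langdetect raises
# LangDetectException for text without alphabetic features, which is exactly
# the case the 'en' fallback covers:
print(language("Ceci n'est pas une pipe"))              # most likely 'fr'
print(language("12345 !!!"))                            # 'en' (exception caught)
print(language("Hallo Welt", override_language="de"))   # 'de'
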
if args.add_summary is not None:
    print('add summary: ' + str(args.add_summary))
    title = args.add_summary[0]
    summary_file = args.add_summary[1]
    add_summary(title, summary_file)
    return
title = args.delete_by_title
if title is not None:
    print('delete by title: ' + title)
    delete_by_title(title, args.keep_file)
    return
if args.lang_detect:
    print(langdetect.detect(args.lang_detect))
    return
parser.print_help()
return

def get_language(self, tree):
    if self.response and 'content-language' in self.response.headers:
        self.lang = self.response.headers['content-language']
    if self.lang is None and 'lang' in tree.attrib:
        self.lang = tree.attrib['lang']
    if self.response and self.lang is None:
        self.lang = self.body_blob.detect_language()
    if self.lang is None:
        self.lang = langdetect.detect(self.body)

def article_meets_posting_requirements(subreddit, website, article_title):
    """
    Validates that the article meets all requirements for posting the list to Reddit.
    The validations below check that:
    (1) the article title contains a number,
    (2) the post hasn't been made already,
    (3) the article title doesn't contain certain pre-defined keywords,
    (4) the article title is in English (BuzzFeed only).
    Returns True if all validations are met, False otherwise.
    """
    if website == ArticleType.BuzzFeed:
        try:
            if detect(article_title) != 'en':
                return False
        except lang_detect_exception.LangDetectException:
            return False
    no_of_elements = get_article_list_count(article_title)
    if no_of_elements == 0:
        return False
    article_title_lowercase = article_title.lower()
    if any(words in article_title_lowercase for words in BREAK_WORDS):
        return False
    if post_previously_made(article_title_lowercase, no_of_elements, subreddit):
        return False
    return True