# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): this is a fragment of a larger parsing loop — the enclosing
# loop/function and the `lines`, `i`, `reviews` bindings are outside this
# view, and the indentation appears to have been flattened in this paste
# (`continue` and `return` imply missing nesting). Restore from the original.
# Parse one 2-line couplet: line i holds the hours played, line i+1 the review.
try:
h = float(lines[i].split()[1].strip())
r = lines[i + 1].split(' ', 1)[1].strip()
except (ValueError, IndexError) as e:
# Malformed couplet — skip both lines and move to the next pair.
i += 2
continue
# Skip reviews that don't have any characters
if not len(r):
i += 2
continue
# Skip reviews if they cannot be recognized as English
try:
if not detect(r) == 'en':
i += 2
continue
except LangDetectException:
# detect() raises when the text has no recognizable language features.
i += 2
continue
# Now we append the 2-key dict to the end of reviews
reviews.append(dict(hours=h,
review=r))
i += 2 # Increment i by 2 since we need to go to the next
# 2-line couplet
return reviews
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
from nltk.stem import PorterStemmer
from PIL import Image
from nltk.tokenize import sent_tokenize, word_tokenize
from langdetect import detect
import langdetect as ld
# Fetch the NLTK resources used downstream: named-entity chunker, word
# lists, the punkt sentence tokenizer, and the POS tagger.
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Shared Porter stemmer instance.
PS = PorterStemmer()
# Image mask that shapes the generated word clouds.
MASK_LOC = "images/wordclouds/mymask.png"
# Shorthand alias for langdetect's detection-failure exception.
LD_EXC = ld.lang_detect_exception.LangDetectException
def wordcloud():
"""
Analyse users' posts, comments and friends data.

Generate word clouds of commonly used words from users' posts and
comments, find the most used language in posts and comments, and
generate a word cloud of friends' names most tagged in the user's posts.
"""
# NOTE(review): indentation of this body appears flattened, and `os` is
# used here but not imported in the visible part of the file — confirm it
# is imported elsewhere. The function also seems truncated: the code that
# follows belongs to an unrelated fragment and never uses `fname`.
loc = input('Enter facebook archive extracted location: ')
if not os.path.isdir(loc):
print("The provided location doesn't seem to be right")
exit(1)
# Path to the comments export inside the extracted archive.
fname = loc+'/comments/comments.json'
# NOTE(review): this span looks like a fragment spliced in from a different
# function — `text`, `scripts` and `logger` are not defined anywhere in the
# visible file, `re` is not visibly imported, and the fragment is cut off
# mid-loop at the end. Verify against the original source before running.
# Split the text on HTML paragraph breaks / dashes so each paragraph can be
# language-detected independently.
paragraphs = re.split(r'(<p></p><p>|\n|</p>|<p>| – |<br><br><br>)+', text)
# One accumulator string per allowed script/language.
separated = {script: '' for script in scripts}
# the first language given is the default one
last_language = scripts[0]
last_paragraph = ''
for paragraph in paragraphs:
# NOTE(review): `r'</p>' r'\n'` below concatenates into a single string
# '</p>\n' — this looks like a missing comma between two tuple items.
if paragraph in (r'</p><p>', r'</p>' r'\n', r'<p>', r'<br><br><br>'):
# skip paragraph breaks to prevent misdetection
separated[last_language] += paragraph
last_paragraph = paragraph
continue
# replace any misleading tags left
paragraph_stripped = re.sub(r'(<(/)?strong>)|(<br>)+|&|<a href=".*"">|</a>', ' ', paragraph)
try:
language = detect(paragraph_stripped)
except LangDetectException:
# an exception means no language could be detected
language = last_language
# langdetect maps "Simplified Chinese" to "zh-cn"
# However, we store it as "zh_hans"
if language == "zh-cn":
language = "zh_hans"
if language not in scripts:
# only detect allowed languages, no exceptions
language = last_language
if language != last_language:
# fix html paragraph breaks after language change
logger.debug('supported language detected: ' + language)
if last_paragraph in (r'</p><p>', r'</p>', r'<p>'):
separated[last_language] = re.sub(r'</p><p>$', '', separated[last_language])
separated[language] += r'</p><p>'
# remove useless dashes after language change</p>
def _detect_language(spacy_object):
    """Detect the dominant language of *spacy_object*'s text.

    Returns a dict with keys ``"language"`` (language code as ``str``) and
    ``"score"`` (detection probability as ``float``). Falls back to
    ``{"language": "UNKNOWN", "score": 0.0}`` when langdetect cannot
    recognize any language in the text.
    """
    try:
        best_guess = detect_langs(spacy_object.text)[0]
    except LangDetectException:
        # No recognizable language features in the text.
        return {"language": "UNKNOWN", "score": 0.0}
    return {
        "language": str(best_guess.lang),
        "score": float(best_guess.prob),
    }
def find_out_language(candidate_languages, *args):
    """Guess which of *candidate_languages* the text samples in *args* use.

    Each sample is run through both ``guess_language`` and langdetect's
    ``detect_langs``; every result that is among *candidate_languages* is
    collected as one vote. The language with the most votes wins; ties are
    broken by first appearance.

    Returns the winning language code, or ``None`` when no sample produced
    a usable detection (or the winner is ``UNKNOWN_LANGUAGE``).
    """
    votes = []
    for sample in args:
        guessed = guess_language(sample)
        if guessed != UNKNOWN_LANGUAGE and guessed in candidate_languages:
            votes.append(guessed)
        try:
            for detected in detect_langs(sample):
                if detected.lang in candidate_languages:
                    votes.append(detected.lang)
        except LangDetectException:
            # No recognizable language in this sample — move on.
            continue
    if not votes:
        return None
    # BUGFIX: the original iterated `candidates[1:0]` (an empty slice),
    # shadowed the accumulator with the loop variable, referenced a stale
    # `candidate` name, and wrote key 'size' while comparing 'count' — so it
    # always returned the first vote. Count votes properly instead.
    best_lang = votes[0]
    best_count = votes.count(best_lang)
    for lang in votes[1:]:
        lang_count = votes.count(lang)
        if lang_count > best_count:
            best_lang, best_count = lang, lang_count
    if best_lang == UNKNOWN_LANGUAGE:
        return None
    return best_lang