# Emoji gender demo: download geolocated tweets, estimate each tweeter's gender from
# their first name, and compare emoji usage by gender. Imports are implied by the snippet.
import io
import urllib.request
from zipfile import ZipFile

import agefromname
import nltk.tokenize
import pandas as pd

import scattertext as st
from scattertext.termranking import OncePerDocFrequencyRanker

# Download the tweet archive and read the bundled spreadsheet.
with ZipFile(io.BytesIO(urllib.request.urlopen(
        'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'
).read())) as zf:
    df = pd.read_excel(zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx'))

# Estimate the probability that each user is male from their first name.
df['first_name'] = df['User Name'].apply(
    lambda x: x.split()[0].lower() if isinstance(x, str) and len(x.split()) > 0 else x)
male_prob = agefromname.AgeFromName().get_all_name_male_prob()
df_aug = pd.merge(df, male_prob, left_on='first_name', right_index=True)

# Keep only users whose first name is strongly gender-associated.
df_aug['gender'] = df_aug['prob'].apply(lambda x: 'm' if x > 0.9 else 'f' if x < 0.1 else '?')
df_mf = df_aug[df_aug['gender'].isin(['m', 'f'])]
df_mf.to_csv('emoji_data.csv', index=False)

# Tokenize tweets with NLTK's TweetTokenizer so emoji survive tokenization.
nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())
df_mf['parse'] = df_mf['Tweet content'].apply(nlp)

# Build a corpus whose only features are the emoji found in each tweet.
corpus = st.CorpusFromParsedDocuments(
    df_mf,
    parsed_col='parse',
    category_col='gender',
    feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
).build()

html = st.produce_scattertext_explorer(
    corpus,
    category='f',
    category_name='Female',
    not_category_name='Male',
    use_full_doc=True,
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=(df_mf['User Name']
              + ' (@' + df_mf['Nickname'] + ')')
)
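
# Hedged follow-up (not in the original snippet): write the explorer to disk the same
# way the other demos in this document do. The file name here is illustrative.
open('demo_emoji.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_emoji.html in Chrome or Firefox.')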

def _get_term_indices_to_compact_from_term_freqs(self, term_freqs, term_doc_matrix, non_text):
    # Drop terms rarer than minimum_term_count, then find redundant terms: terms whose
    # tokens are entirely contained in another surviving term (e.g. a unigram that only
    # ever appears as part of a bigram).
    idx = IndexStore()
    tdf_vals = term_freqs.values
    valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
    tdf_vals = term_freqs[valid_terms_mask].values
    terms = np.array(term_freqs.index)[valid_terms_mask]

    # Build a binary term-by-token incidence matrix and record each term's token count.
    lengths = []
    fact = CSRMatrixFactory()
    for i, t in enumerate(terms):
        for tok in t.split():
            fact[i, idx.getidx(tok)] = 1
        lengths.append(len(t.split()))
    lengths = np.array(lengths)
    mat = fact.get_csr_matrix()

    # (mat * mat.T)[i, j] counts the tokens shared by terms i and j; subtracting this from
    # the token counts leaves zeros exactly where one term's tokens are a subset of another's.
    coocs = lengths - (mat * mat.T)
    pairs = np.argwhere(coocs == 0).T
    pairs = self._limit_to_non_identical_terms(pairs)
    pairs = self._limit_to_pairs_of_bigrams_and_a_constituent_unigram(pairs, terms)
    pairs = self._limit_to_redundant_unigrams(pairs, tdf_vals)

    # Collect the indices of redundant and infrequent terms so they can be removed.
    idx_store = term_doc_matrix._get_relevant_idx_store(non_text)
    redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(pairs[:, 1])])
    infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
    terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
    return terms_to_remove
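
# Illustrative aside (not part of the library): a dense NumPy version of the containment
# trick above. With a binary term-by-token matrix, lengths - (mat @ mat.T) hits zero at
# (i, j) exactly when every token of term j also occurs in term i.
import numpy as np

example_terms = ['jobs', 'create jobs', 'taxes']
tokens = sorted({tok for t in example_terms for tok in t.split()})
mat = np.array([[1 if tok in t.split() else 0 for tok in tokens] for t in example_terms])
lengths = np.array([len(t.split()) for t in example_terms])
containment = (lengths - mat @ mat.T) == 0
print(np.argwhere(containment))  # includes [1, 0]: 'jobs' is contained in 'create jobs'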

# Library-internal fragment: count unigrams and bigrams per document and assemble a
# TermDocMatrix. IndexStore, CSRMatrixFactory and TermDocMatrix come from scattertext's
# internals; the wrapper function and accumulator setup below are hypothetical additions,
# since the original excerpt starts mid-method.
import string
from collections import Counter

import numpy as np


def _build_term_doc_matrix(category_text_iter):  # hypothetical wrapper
    y = []
    X_factory = CSRMatrixFactory()
    term_idx_store = IndexStore()
    category_idx_store = IndexStore()
    mX_factory = CSRMatrixFactory()
    for doci, (category, text) in enumerate(category_text_iter):
        y.append(category_idx_store.getidx(category))
        term_freq = Counter()
        for sent in text.strip(string.punctuation).lower().split('\n'):
            unigrams = []
            for tok in sent.strip().split():
                unigrams.append(tok)
            # Pair each token with its successor to form bigrams.
            bigrams = list(map(' '.join, zip(unigrams[:-1], unigrams[1:])))
            for term in unigrams + bigrams:
                term_freq[term_idx_store.getidx(term)] += 1
        for word_idx, freq in term_freq.items():
            X_factory[doci, word_idx] = freq
    metadata_idx_store = IndexStore()
    return TermDocMatrix(X=X_factory.get_csr_matrix(),
                         mX=mX_factory.get_csr_matrix(),
                         y=np.array(y),
                         term_idx_store=term_idx_store,
                         metadata_idx_store=metadata_idx_store,
                         category_idx_store=category_idx_store)
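
# Quick illustrative check (not from the repo) of the bigram construction used above:
# zipping the token list against itself shifted by one yields adjacent-word pairs.
unigrams = 'we will create jobs'.split()
print(list(map(' '.join, zip(unigrams[:-1], unigrams[1:]))))
# ['we will', 'will create', 'create jobs']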

from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer, HedgesR
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = (CorpusFromPandas(convention_df,
                           category_col='party',
                           text_col='text',
                           nlp=whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus())
html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    term_scorer=HedgesR(corpus),
    metadata=convention_df['speaker'],
    grey_threshold=0
)
file_name = 'demo_hedges_r.html'
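# The original example stops after defining file_name. A hedged completion, following
# the write pattern used elsewhere in this document:
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open ./' + file_name + ' in Chrome or Firefox.')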

import scattertext as st

df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)
)
corpus = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0, pmi_threshold_coefficient=0,
    width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    max_overlapping=3
)
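
# Illustrative aside (not from the repo): the dense_rank transform positions terms by
# their frequency rank on each axis rather than by raw frequency, with ties sharing a
# rank and no gaps. pandas exposes the same ranking scheme directly:
import pandas as pd
print(pd.Series([1, 5, 5, 20]).rank(method='dense').tolist())  # [1.0, 2.0, 2.0, 3.0]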

from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.WhitespaceNLP import whitespace_nlp

nlp = whitespace_nlp
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'])
open('./demo_without_spacy.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_without_spacy.html in Chrome or Firefox.')

# Imports inferred from the function body.
import spacy

from scattertext import SampleCorpora, word_similarity_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas


def main():
    nlp = spacy.load('en')  # newer spaCy releases use a model name such as 'en_core_web_sm'
    convention_df = SampleCorpora.ConventionData2012.get_data()
    corpus = CorpusFromPandas(convention_df,
                              category_col='party',
                              text_col='text',
                              nlp=nlp).build()
    html = word_similarity_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    target_term='jobs',
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'],
                                    alpha=0.01,
                                    max_p_val=0.1,
                                    save_svg_button=True)
    open('./demo_similarity.html', 'wb').write(html.encode('utf-8'))
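
# Standard script entry point (not shown in the original excerpt):
if __name__ == '__main__':
    main()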

import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
import umap
from scipy.sparse.linalg import svds

import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus())
corpus = corpus.add_doc_names_as_metadata(corpus.get_df()['speaker'])

# TF-IDF weight the term-document counts, then project to two dimensions with UMAP.
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
projection_raw = umap.UMAP(min_dist=0.5, metric='cosine').fit_transform(embeddings).T
projection = pd.DataFrame({'term': corpus.get_metadata(),
                           'x': projection_raw[0],
                           'y': projection_raw[1]}).set_index('term')
category = 'democrat'
scores = (corpus.get_category_ids() == corpus.get_categories().index(category)).astype(int)
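
# The original example is cut off here. At this point, projection holds one 2-D point per
# metadata entry and scores flags which documents fall in the Democratic category; a quick
# illustrative inspection:
print(projection.head())
print(scores[:5])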

import numpy as np
import spacy
from sklearn.linear_model import LogisticRegression

from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.Scalers import scale

nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()
term_freq_df = corpus.get_term_freq_df()


def zero_centered_scale(ar):
    # Rescale positive and negative values separately, then shift so zero maps to 0.5.
    ar[ar > 0] = scale(ar[ar > 0])
    ar[ar < 0] = -scale(-ar[ar < 0])
    return (ar + 1) / 2.


frequencies_scaled = scale(np.log(term_freq_df.sum(axis=1).values))
# The original call is cut off after the category argument; get_logreg_coefs presumably
# also takes a scikit-learn classifier, so a plain LogisticRegression stands in here.
scores = corpus.get_logreg_coefs('democrat', LogisticRegression())
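
# Hedged continuation (the original stops mid-call): the helpers above suggest the
# coefficients are mapped into [0, 1] around 0.5 before plotting, e.g.:
scores_scaled = zero_centered_scale(scores)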

import numpy as np
import spacy
from sklearn.linear_model import LogisticRegression

from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()
term_freq_df = corpus.get_term_freq_df()


def scale(ar):
    return (ar - ar.min()) / (ar.max() - ar.min())


def zero_centered_scale(ar):
    ar[ar > 0] = scale(ar[ar > 0])
    ar[ar < 0] = -scale(-ar[ar < 0])
    return (ar + 1) / 2.
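
# Illustrative check (not from the repo): zero_centered_scale keeps zero at 0.5, pushes
# positive values toward 1 and negative values toward 0. Note it modifies its argument
# in place.
print(zero_centered_scale(np.array([-2.0, -1.0, 0.0, 1.0, 4.0])).tolist())
# [0.0, 0.5, 0.5, 0.5, 1.0]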