How to use the scattertext.CorpusFromParsedDocuments function in scattertext

To help you get started, we’ve selected a few scattertext examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github JasonKessler / scattertext / demo_emoji.py View on Github external
with ZipFile(io.BytesIO(urllib.request.urlopen(
			'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'
	).read())) as zf:
		df = pd.read_excel(zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx'))
	df['first_name'] = df['User Name'].apply(
		lambda x: x.split()[0].lower() if type(x) == str and len(x.split()) > 0 else x)
	male_prob = agefromname.AgeFromName().get_all_name_male_prob()
	df_aug = pd.merge(df, male_prob, left_on='first_name', right_index=True)
	df_aug['gender'] = df_aug['prob'].apply(lambda x: 'm' if x > 0.9 else 'f' if x < 0.1 else '?')
	df_mf = df_aug[df_aug['gender'].isin(['m', 'f'])]
	df_mf.to_csv('emoji_data.csv', index=False)

# Wrap NLTK's TweetTokenizer with scattertext's tokenizer factory, then
# parse every tweet into the 'parse' column.
nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())
df_mf['parse'] = df_mf['Tweet content'].apply(nlp)

# Build the corpus from the pre-parsed tweets: documents come from 'parse',
# categories from 'gender', features from st.FeatsFromSpacyDocOnlyEmoji.
emoji_feats = st.FeatsFromSpacyDocOnlyEmoji()
corpus_factory = st.CorpusFromParsedDocuments(
	df_mf,
	parsed_col='parse',
	category_col='gender',
	feats_from_spacy_doc=emoji_feats
)
corpus = corpus_factory.build()

html = st.produce_scattertext_explorer(
	corpus,
	category='f',
	category_name='Female',
	not_category_name='Male',
	use_full_doc=True,
	term_ranker=OncePerDocFrequencyRanker,
	sort_by_dist=False,
	metadata=(df_mf['User Name']
	          + ' (@' + df_mf['Nickname'] + ') '
github JasonKessler / scattertext / demo_chinese.py View on Github external
def main():
	"""Build the Chinese demo corpus and write its explorer page to disk.

	Reads the CSV at the URL below, parses each entry of the 'text' column
	with ``chinese_nlp``, builds a corpus categorised by the 'novel' column,
	and writes the generated HTML to ./demo_chinese.html.
	"""
	df = pd.read_csv('https://cdn.rawgit.com/JasonKessler/scattertext/e508bf32/scattertext/data/chinese.csv')
	df['text'] = df['text'].apply(chinese_nlp)
	corpus = CorpusFromParsedDocuments(
		df,
		category_col='novel',
		parsed_col='text'
	).build()
	html = produce_scattertext_explorer(
		corpus,
		category='Tale of Two Cities',
		category_name='Tale of Two Cities',
		not_category_name='Ulysses',
		width_in_pixels=1000,
		metadata=df['novel'],
		asian_mode=True
	)
	# Write UTF-8 bytes explicitly: the page contains Chinese text, and the
	# platform default text encoding may be unable to represent it. The
	# ``with`` block guarantees the file is flushed and closed.
	with open('./demo_chinese.html', 'wb') as out_file:
		out_file.write(html.encode('utf-8'))
	print('Open ./demo_chinese.html in Chrome or Firefox.')
github JasonKessler / scattertext / demo_corpus_visualization.py View on Github external
def make_political_corpus():
	"""Assemble a corpus of parsed speeches categorised by party.

	Each (party, speech) pair from ``iter_party_speech_pairs`` is cleaned and
	its speaker name extracted; pairs with an empty cleaned speech or an
	empty speaker name are dropped. Remaining speeches are parsed with
	``fast_but_crap_nlp``.

	Returns:
		A ``(corpus, source_df)`` tuple: the built corpus and the DataFrame
		(columns 'party', 'text', 'speaker') it was built from.
	"""
	clean = clean_function_factory()
	get_speaker_name = speaker_name_factory()
	rows = []
	for party, speech in iter_party_speech_pairs():
		cleaned = clean(speech)
		speaker = get_speaker_name(speech)
		# Skip records that lack usable text or a speaker name.
		if not cleaned or cleaned == '' or speaker == '':
			continue
		rows.append({
			'party': party,
			'text': fast_but_crap_nlp(cleaned),
			'speaker': speaker,
		})
	source_df = pd.DataFrame(rows)
	corpus_factory = CorpusFromParsedDocuments(
		source_df,
		category_col='party',
		parsed_col='text'
	)
	return corpus_factory.build(), source_df
github JasonKessler / scattertext / demo_bow_pca.py View on Github external
from sklearn.decomposition import TruncatedSVD

import scattertext as st
from scattertext import ClassPercentageCompactor, CSRMatrixFactory
from scattertext.representations.CorpusSentenceIterator import CorpusSentenceIterator

# Load scattertext's ConventionData2012 sample data and parse each speech
# with the whitespace-based parser.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)

# Build the corpus by party, keep stoplisted unigrams only, then compact it
# with ClassPercentageCompactor(term_count=3).
parsed_corpus = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
).build()
unigram_corpus = parsed_corpus.get_stoplisted_unigram_corpus()
corpus = unigram_corpus.select(ClassPercentageCompactor(term_count=3))

# Project the term-document matrix with truncated SVD and plot dimensions
# 0 and 1.
html = st.produce_projection_explorer(
    corpus,
    embeddings=corpus.get_term_doc_mat(),
    projection_model=TruncatedSVD(n_components=30, n_iter=10),
    x_dim=0,
    y_dim=1,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    metadata=convention_df.speaker,
    width_in_pixels=1000,
)
github JasonKessler / scattertext / demo_empath.py View on Github external
def main():
	"""Build the Empath-feature convention demo and write its explorer page.

	Loads the ConventionData2012 sample data, builds a corpus categorised by
	'party' using ``FeatsFromOnlyEmpath`` as the feature extractor, and writes
	the generated HTML to ./Convention-Visualization-Empath.html as UTF-8.
	"""
	convention_df = SampleCorpora.ConventionData2012.get_data()
	feat_builder = FeatsFromOnlyEmpath()
	corpus = CorpusFromParsedDocuments(
		convention_df,
		category_col='party',
		parsed_col='text',
		feats_from_spacy_doc=feat_builder
	).build()
	html = produce_scattertext_explorer(
		corpus,
		category='democrat',
		category_name='Democratic',
		not_category_name='Republican',
		width_in_pixels=1000,
		metadata=convention_df['speaker'],
		use_non_text_features=True,
		use_full_doc=True,
		topic_model_term_lists=feat_builder.get_top_model_term_lists()
	)
	# Use a context manager so the handle is closed (and the data flushed)
	# deterministically instead of relying on garbage collection.
	with open('./Convention-Visualization-Empath.html', 'wb') as out_file:
		out_file.write(html.encode('utf-8'))
	print('Open ./Convention-Visualization-Empath.html in Chrome or Firefox.')
github JasonKessler / scattertext / demo_compact.py View on Github external
import scattertext as st

# Load the ConventionData2012 sample data and add a 'parse' column holding
# each speech parsed by the whitespace-based parser.
df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda frame: frame.text.apply(st.whitespace_nlp_with_sentences)
)

# Build the corpus by party, keep unigrams, and compact it with
# AssociationCompactor(2000).
corpus = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))

html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0, pmi_threshold_coefficient=0,
    width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    max_overlapping=3
)

# Write UTF-8 bytes inside a context manager: the handle is closed
# deterministically and the output does not depend on the platform's default
# text encoding.
with open('./demo_compact.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('open ./demo_compact.html in Chrome')
github JasonKessler / scattertext / demo_names.py View on Github external
import scattertext as st
import spacy

# Load the English pipeline by its full package name. The 'en' shortcut link
# is rejected by spaCy v3+, while the full name works wherever the package is
# installed.
# NOTE(review): assumes 'en' previously pointed at en_core_web_sm — confirm.
nlp = spacy.load('en_core_web_sm')

# Parse every speech with spaCy and store the resulting docs in 'parse'.
df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda frame: list(nlp.pipe(frame.text))
)

# Build the corpus by party using entity-based features.
# NOTE(review): 'NAME' does not look like a standard spaCy English NER label
# (person names are usually 'PERSON') — confirm the intended entity types.
corpus = st.CorpusFromParsedDocuments(
    df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=st.SpacyEntities(entity_types_to_use=['NAME', 'LOC'])
).build()

html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0, pmi_threshold_coefficient=0,
    width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    max_overlapping=10,
    max_docs_per_category=0
github JasonKessler / scattertext / demo_umap_documents.py View on Github external
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
import umap
import scattertext as st
from scipy.sparse.linalg import svds

# Load the ConventionData2012 sample data and parse each speech with the
# whitespace-based parser.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)

# Build a stoplisted unigram corpus by party, then attach each document's
# speaker as its metadata.
unigram_corpus = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
).build().get_stoplisted_unigram_corpus()
corpus = unigram_corpus.add_doc_names_as_metadata(unigram_corpus.get_df()['speaker'])

# TF-IDF weight the term-document matrix and project it with UMAP
# (cosine metric); transpose so rows 0 and 1 hold the two coordinates.
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
umap_model = umap.UMAP(min_dist=0.5, metric='cosine')
projection_raw = umap_model.fit_transform(embeddings).T
projection = pd.DataFrame({
    'term': corpus.get_metadata(),
    'x': projection_raw[0],
    'y': projection_raw[1],
}).set_index('term')

# 1 for documents in the target category, 0 for all others.
category = 'democrat'
category_ids = corpus.get_category_ids()
target_index = corpus.get_categories().index(category)
scores = (category_ids == target_index).astype(int)
html = st.produce_pca_explorer(corpus,
                               category=category,
github JasonKessler / scattertext / demo_embeddings_pca.py View on Github external
import scattertext as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse.linalg import svds

# Load the ConventionData2012 sample data and parse each speech with the
# whitespace-based parser.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)

# Build a stoplisted unigram corpus by party and drop infrequent words
# (minimum_term_count=3, ranked by OncePerDocFrequencyRanker).
unigram_corpus = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
).build().get_stoplisted_unigram_corpus()
corpus = unigram_corpus.remove_infrequent_words(
    minimum_term_count=3,
    term_ranker=st.OncePerDocFrequencyRanker,
)

# TF-IDF weight the term-document matrix, transpose it, and take a
# rank-3 truncated SVD.
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat()).T
U, S, VT = svds(embeddings, k=3, maxiter=20000, which='LM')

# Use rows 0 and 1 of U.T as the plot coordinates, indexed by term.
x_dim, y_dim = 0, 1
projection = pd.DataFrame({
    'term': corpus.get_terms(),
    'x': U.T[x_dim],
    'y': U.T[y_dim],
}).set_index('term')
html = st.produce_pca_explorer(corpus,
                               category='democrat',
                               category_name='Democratic',
                               not_category_name='Republican',