from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import csv
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in modern scikit-learn
from sklearn.preprocessing import LabelEncoder
from azureml.logging import get_azureml_logger
# Assumption: Package comes from the Azure ML Workbench data prep SDK; adjust this
# import to match the installed azureml.dataprep version.
from azureml.dataprep import Package
# initialize the logger
run_logger = get_azureml_logger()
run_logger.log('amlrealworld.ChurnPrediction.CATelcoCustomerChurnModeling','true')
# Load the training dataflow prepared in Azure ML Workbench.
with Package.open_package('CATelcoCustomerChurnTrainingSample.dprep') as pkg:
    df = pkg.dataflows[0].get_dataframe(spark=False)

# One-hot encode every categorical/object column.
columns_to_encode = list(df.select_dtypes(include=['category', 'object']))
for column_to_encode in columns_to_encode:
    dummies = pd.get_dummies(df[column_to_encode])
    one_hot_col_names = []
    for col_name in list(dummies.columns):
        one_hot_col_names.append(column_to_encode + '_' + col_name)
    dummies.columns = one_hot_col_names
    df = df.drop(column_to_encode, axis=1)
    df = df.join(dummies)
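# Note: the same one-hot encoding can be done in a single call with
# pd.get_dummies(df, columns=columns_to_encode), which also prefixes each new
# column with its source column name.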
# Hold out 30% of the data for evaluation.
model = GaussianNB()
random_seed = 42
train, test = train_test_split(df, random_state=random_seed, test_size=0.3)
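# A minimal training/evaluation sketch (not part of the original excerpt); it assumes
# the churn label column in this dataset is named 'churn'.
target_column = 'churn'  # assumption: name of the label column
train_x = train.drop(target_column, axis=1)
train_y = train[target_column]
test_x = test.drop(target_column, axis=1)
test_y = test[target_column]
model.fit(train_x, train_y)
accuracy = accuracy_score(test_y, model.predict(test_x))
run_logger.log('Naive Bayes accuracy', accuracy)
print("Naive Bayes accuracy:", accuracy)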
# Build the sentence/word graph for the ranking step.
revsents = sc.textFile(path).flatMap(lambda review: dataprep.create_vertices(review))
revsents = revsents.cache()
vertices = revsents.map(lambda l: dataprep.clean_vertex(l))
vertices.cache()
# allvertices is assumed to be defined elsewhere (the full vertex list passed to create_adjlist).
graph = vertices.map(lambda ver: dataprep.create_adjlist(ver, allvertices)) \
                .filter(lambda l: len(l[1]) > 0) \
                .cache()  # remove this filter if not much use
# Seed every vertex's rank with the base value 0.15 (i.e. 1 minus a damping factor of 0.85).
rank = graph.map(lambda vertex_neighbors: (vertex_neighbors[0], 0.15))
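# A hedged sketch (not from the original excerpt) of the rank-update loop that a
# TextRank/PageRank-style job typically runs on top of `graph` and `rank`; it
# assumes unweighted edges and a damping factor of 0.85.
NUM_ITERATIONS = 10  # assumption: fixed iteration count instead of a convergence test
for _ in range(NUM_ITERATIONS):
    # Each vertex sends an equal share of its current rank to every neighbour.
    contribs = graph.join(rank).flatMap(
        lambda v: [(neighbor, v[1][1] / len(v[1][0])) for neighbor in v[1][0]]
    )
    # New rank = base value + damping factor * sum of received contributions.
    rank = contribs.reduceByKey(lambda a, b: a + b) \
                   .mapValues(lambda s: 0.15 + 0.85 * s)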
import bz2
import json
import logging

def documents_from_file(filename):
    # Stream Document objects from a bz2-compressed JSON-lines file, skipping
    # lines that fail to parse.
    with bz2.open(filename, 'rt', encoding="UTF-8") as f:
        for line in f:
            try:
                yield Document.from_json(json.loads(line))
            except ValueError as e:
                logging.warning("Error while reading document (%s); skipping", e)
import gzip
import os
import pickle

import multiprocessing_generator

# Excerpt from a generator that yields labeled, featurized documents. The def header
# and the opening of the if-branch are reconstructed; token_stats, dirname, p (the
# decompression subprocess), model_settings, labeled_tokens(), TokenStatistics,
# docs_with_normalized_features, and labeled_and_featurized_tokens_path are assumed
# to be defined in the enclosing scope of the original source.
def labeled_and_featurized_tokens():  # assumed name for this nested generator
    if os.path.exists(labeled_and_featurized_tokens_path):
        # Cached file exists: stream pickled documents from the subprocess's stdout.
        doc_count = 0
        while True:
            try:
                yield pickle.load(p.stdout)
                doc_count += 1
            except EOFError:
                break
        assert doc_count >= 400, "Number of documents (%d) was less than expected (400) from %s. File is likely incomplete" % (
            doc_count, labeled_and_featurized_tokens_path
        )
    else:
        logging.warning(
            "Could not find %s, recreating it", labeled_and_featurized_tokens_path
        )
        nonlocal token_stats  # valid because the original nests this def inside a larger function
        if token_stats is None:
            token_stats = TokenStatistics(os.path.join(dirname, "all.tokenstats2.gz"))
        # Write to a process-specific temp file so concurrent runs cannot clobber each other.
        temp_labeled_and_featurized_tokens_path = \
            labeled_and_featurized_tokens_path + ".%d.temp" % os.getpid()
        # Produce labeled tokens in a background process, normalize their features,
        # then cache each document to disk while also yielding it to the caller.
        with multiprocessing_generator.ParallelGenerator(
            labeled_tokens(), max_lookahead=64
        ) as docs:
            docs = docs_with_normalized_features(
                model_settings.max_page_number,
                model_settings.token_hash_size,
                model_settings.font_hash_size,
                token_stats,
                docs)
            with gzip.open(temp_labeled_and_featurized_tokens_path, "wb") as f:
                for doc in docs:
                    yield doc
                    pickle.dump(doc, f)
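        # Hedged continuation (not part of the excerpt): after writing completes, the
        # temp file would typically be renamed into place so later runs find the cache,
        # e.g. os.rename(temp_labeled_and_featurized_tokens_path, labeled_and_featurized_tokens_path).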
# Excerpt from the Document class: an alternate constructor that parses one JSON
# record (the enclosing class definition is not part of this excerpt).
@classmethod
def from_json(cls, json_doc):
    doc_id = json_doc["docId"]
    pages = [Page.from_json(p) for p in json_doc.get("pages", [])]
    return Document(doc_id=doc_id, pages=pages)
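# A hedged usage sketch (not from the original source): build a Document from a
# minimal record using the JSON keys read above ("docId", "pages"). It assumes the
# Document class stores its constructor arguments as doc_id and pages attributes;
# a full run would instead stream real records via documents_from_file.
doc = Document.from_json({"docId": "example-001", "pages": []})
print(doc.doc_id, len(doc.pages))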