def toDF(self):
    """
    This is a convenience function that calls the global eval method and then converts the matrix object into a DataFrame.
    """
    if isinstance(self.eval_data, DataFrame):
        return self.eval_data
    if isinstance(self.eval_data, py4j.java_gateway.JavaObject):
        # Convert the wrapped Java object back into its Python counterpart
        self.eval_data = _java2py(
            SparkContext._active_spark_context, self.eval_data)
    if isinstance(self.eval_data, Matrix):
        self.eval_data = self.eval_data.toDF()
        return self.eval_data
    # Fall back to building the DataFrame from a pandas representation
    self.eval_data = matrix.sparkSession.createDataFrame(self.toPandas())
    return self.eval_data
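For context, a minimal usage sketch (assuming Apache SystemML's Python matrix API with an active SparkSession; the input values and variable names are illustrative):

import numpy as np
from systemml import matrix

m = matrix(np.array([[1.0, 2.0], [3.0, 4.0]]))
result = m + m          # lazily built matrix expression
df = result.toDF()      # forces evaluation, then converts eval_data to a Spark DataFrame
df.show()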
| Java|2013| 30000|
+------+----+--------+
"""
import pandas
minPandasVersion = '0.7.1'
if LooseVersion(pandas.__version__) < LooseVersion(minPandasVersion):
    raise ImportError('Pandas installed but version is {}, {} required'
                      .format(pandas.__version__, minPandasVersion))
# Do a null aggregation to retrieve the keys first (should be no computation)
key_cols = grouped_data.agg({}).columns
if not cols:
    # Extract the full column list with the parent df
    javaDFName = "org$apache$spark$sql$RelationalGroupedDataset$$df"
    parentDF = java_gateway.get_field(grouped_data._jgd, javaDFName)
    all_cols = DataFrame(parentDF, None).columns
    key_cols_set = set(key_cols)
    cols = [col for col in all_cols if col not in key_cols_set]
if "*" in cols:
    raise ValueError("cols expected to contain only singular columns")
if len(set(cols)) < len(cols):
    raise ValueError("cols expected not to contain duplicate columns")
if not isinstance(schema, StructType):
    raise ValueError("output schema should be a StructType")
inputAggDF = grouped_data.agg({col: 'collect_list' for col in cols})
# Recover canonical order (aggregation may change column order)
canonicalOrder = chain(key_cols, (inputAggDF['collect_list(' + col + ')'] for col in cols))
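For reference, a small hedged illustration of the "null aggregation" trick used above to recover the grouping keys (the data and column names are illustrative):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("Java", 2013, 30000)], ["lang", "year", "salary"])
grouped = df.groupBy("lang", "year")
# An empty aggregation yields a DataFrame holding only the grouping columns,
# so .columns gives the key names without running a real aggregation job.
print(grouped.agg({}).columns)   # ['lang', 'year']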
"""
import pandas as pd
minPandasVersion = '0.7.1'
if LooseVersion(pd.__version__) < LooseVersion(minPandasVersion):
raise ImportError('Pandas installed but version is {}, {} required'
.format(pd.__version__, minPandasVersion))
# Do a null aggregation to retrieve the keys first (should be no computation)
# Also consistent with spark.sql.retainGroupColumns
keySchema = grouped_data.agg({}).schema
keyCols = grouped_data.agg({}).columns
if not cols:
# Extract the full column list with the parent df
javaDFName = "org$apache$spark$sql$RelationalGroupedDataset$$df"
parentDF = java_gateway.get_field(grouped_data._jgd, javaDFName)
allCols = DataFrame(parentDF, None).columns
keyColsSet = set(keyCols)
cols = [col for col in allCols if col not in keyColsSet]
if "*" in cols:
raise ValueError("cols expected to contain only singular columns")
if len(set(cols)) < len(cols):
raise ValueError("cols expected not to contain duplicate columns")
if not isinstance(schema, StructType):
raise ValueError("output schema should be a StructType")
inputAggDF = grouped_data.agg({col: 'collect_list' for col in cols})
# Recover canonical order (aggregation may change column order)
canonicalOrder = chain(keyCols, [inputAggDF['collect_list(' + col + ')'] for col in cols])
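A hedged illustration of the collect_list aggregation the canonical-order step relies on: the aggregated columns come back named collect_list(<col>), which is why they are selected by that pattern above (the data and column names here are illustrative):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1), ("a", 2), ("b", 3)], ["key", "value"])
agg_df = df.groupBy("key").agg({"value": "collect_list"})
print(agg_df.columns)   # ['key', 'collect_list(value)']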
paramRanges = []
# Using some bsearch code here, thanks to Forrest Stonedahl and the NetLogo team
for paramSpec in paramSpecs:
    paramRange = []
    if (jg.is_instance_of(self.__gateway, paramSpec, "bsearch.space.DoubleDiscreteSpec")
            or jg.is_instance_of(self.__gateway, paramSpec, "bsearch.space.DoubleContinuousSpec")):
        count = paramSpec.choiceCount()
        val_min = paramSpec.getValueFromChoice(0, count)
        val_max = paramSpec.getValueFromChoice(count - 1, count)
        step = (val_max - val_min) / (count - 1)
        paramRange = [val_min, step, val_max]
    if jg.is_instance_of(self.__gateway, paramSpec, "bsearch.space.CategoricalSpec"):
        count = paramSpec.choiceCount()
        paramRange = []
        for choice in range(count):
            paramRange.append(paramSpec.getValueFromChoice(choice, count))
    if jg.is_instance_of(self.__gateway, paramSpec, "bsearch.space.ConstantSpec"):
        paramRange = [paramSpec.getValueFromChoice(0, 1)]
    paramRanges.append(paramRange)
return paramRanges
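To make the numeric branch concrete, a small hedged sketch of how the [min, step, max] triple is derived for a spec with count evenly spaced choices (the values below are made up):

count = 5
choices = [0.0, 2.5, 5.0, 7.5, 10.0]        # stand-in for getValueFromChoice(i, count)
val_min, val_max = choices[0], choices[-1]
step = (val_max - val_min) / (count - 1)    # 2.5
print([val_min, step, val_max])             # [0.0, 2.5, 10.0]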
def __getattr__(self, item):
    if "java_fields" in self.__dict__ and item in self.__dict__["java_fields"]:
        # Read the field directly from the underlying Java object
        a = java_gateway.get_field(self.java_obj, item)
        if gateway.jvm.pyboof.PyBoofEntryPoint.isConfigClass(a):
            # Wrap nested configuration objects so their fields stay accessible the same way
            return JavaConfig(a)
        else:
            return a
    else:
        return object.__getattribute__(self, item)

def __setattr__(self, key, value):
    if "java_fields" in self.__dict__ and key in self.__dict__["java_fields"]:
        # Mirror assignments onto the Java object's field
        java_gateway.set_field(self.java_obj, key, value)
    else:
        self.__dict__[key] = value
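A hypothetical usage sketch of the proxying above (the JavaConfig construction and the field name "radius" are illustrative; any attribute listed in java_fields is forwarded to the wrapped py4j JavaObject):

cfg = JavaConfig(java_config_object)   # java_config_object: a py4j JavaObject
cfg.radius = 5                         # routed through java_gateway.set_field(...)
print(cfg.radius)                      # routed through java_gateway.get_field(...)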
def logistic_regression_gd(data, step_size=1.0, max_iters=10):
    """
    Train a logistic regression model with gradient descent (logisticRegressionWithSGD).

    :param data: distributed data frame whose last column is the label and whose remaining columns are features
    :param step_size: gradient descent step size
    :param max_iters: maximum number of iterations
    :return: a LogisticRegressionModel wrapping the trained Java model and its weights
    """
    ml_obj = java_gateway.get_field(data._jddf, 'ML')
    gateway = data._gateway_client
    model = ml_obj.train('logisticRegressionWithSGD',
                         util.to_java_array([max_iters, step_size],
                                            gateway.jvm.Object, gateway))
    # Collect the intercept and coefficients into a one-row pandas DataFrame
    weights = [float(model.getRawModel().intercept())] + \
        list(model.getRawModel().weights().toArray())
    weights = pd.DataFrame(data=[weights],
                           columns=['Intercept'] + data.colnames[:-1])
    return LogisticRegressionModel(model, gateway, weights)
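A hypothetical usage sketch, assuming a frame whose last column is the label and that the returned model exposes the weights DataFrame passed to its constructor (names below are illustrative):

model = logistic_regression_gd(train_data, step_size=0.1, max_iters=100)
print(model.weights)   # one-row DataFrame: Intercept plus one coefficient per feature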
def get_property(self, name):
    return java_gateway.get_field(self.java_obj, name)

def __getattr__(self, item):
    if "java_fields" in self.__dict__ and item in self.__dict__["java_fields"]:
        return java_gateway.get_field(self.java_obj, item)
    else:
        return object.__getattribute__(self, item)