Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
dta_file = os.path.join(self.missing_data_folder, "missing_test.dta")
unformatted_csv = os.path.join(self.missing_data_folder, "missing_unformatted.csv")
formatted_csv = os.path.join(self.missing_data_folder, "missing_dta_formatted.csv")
labeled_csv = os.path.join(self.missing_data_folder, "missing_dta_labeled.csv")
df_sas, meta = pyreadstat.read_dta(dta_file)
df_csv = pd.read_csv(unformatted_csv)
self.assertTrue(df_sas.equals(df_csv))
df_sas, meta = pyreadstat.read_dta(dta_file, user_missing=True)
df_csv = pd.read_csv(formatted_csv)
self.assertTrue(df_sas.equals(df_csv))
missing_user_values = {'var1':['a'],'var2': ['b'], 'var3':['c'], 'var4':['x'], 'var5':['y'], 'var6':['z']}
self.assertDictEqual(meta.missing_user_values, missing_user_values)
df_sas, meta = pyreadstat.read_dta(dta_file,
apply_value_formats=True, user_missing=True,
formats_as_category=False)
df_csv = pd.read_csv(labeled_csv)
self.assertTrue(df_sas.equals(df_csv))
def test_dta_usecols(self):
    """Read ``sample.dta`` restricted to ``self.usecols`` and verify the result.

    The returned frame must equal ``self.df_usecols`` (with ``myord`` cast to
    int64), and the metadata must report exactly the requested columns.
    """
    sample_path = os.path.join(self.basic_data_folder, "sample.dta")
    df, meta = pyreadstat.read_dta(sample_path, usecols=self.usecols)

    expected = self.df_usecols.copy()
    # DataFrame.equals is dtype-sensitive, so the reference column is cast to
    # int64 (presumably the dtype the reader yields for "myord" -- confirm).
    expected["myord"] = expected["myord"].astype(np.int64)

    self.assertTrue(df.equals(expected))
    # assertEqual reports both operands on failure, unlike assertTrue(a == b).
    self.assertEqual(meta.number_columns, len(self.usecols))
    self.assertEqual(meta.column_names, self.usecols)
def test_dta_metaonly(self):
    """Check that ``metadataonly=True`` returns no rows but the same metadata.

    ``sample.dta`` is read twice, once normally and once with
    ``metadataonly=True``. The second frame must be empty, and the column
    count, row count, column names and column labels must match the full read.
    """
    sample_path = os.path.join(self.basic_data_folder, "sample.dta")
    df, meta = pyreadstat.read_dta(sample_path)
    df2, meta2 = pyreadstat.read_dta(sample_path, metadataonly=True)

    self.assertTrue(df2.empty)
    # assertEqual reports both operands on failure, unlike assertTrue(a == b).
    self.assertEqual(meta.number_columns, meta2.number_columns)
    self.assertEqual(meta.number_rows, meta2.number_rows)
    self.assertEqual(meta.column_names, meta2.column_names)
    self.assertEqual(meta.column_labels, meta2.column_labels)
def test_dta(self):
    """Read ``sample.dta`` and compare data and dimensions with ``self.df_pandas``.

    The reference frame's ``myord`` and ``mylabl`` columns are cast to int64
    before comparison; the metadata must report the reference frame's number
    of columns and rows.
    """
    sample_path = os.path.join(self.basic_data_folder, "sample.dta")
    df, meta = pyreadstat.read_dta(sample_path)

    expected = self.df_pandas.copy()
    # DataFrame.equals is dtype-sensitive, so the integer-coded columns are
    # cast to int64 (presumably the dtype the reader yields -- confirm).
    for column in ("myord", "mylabl"):
        expected[column] = expected[column].astype(np.int64)

    self.assertTrue(df.equals(expected))
    # assertEqual reports both operands on failure, unlike assertTrue(a == b).
    self.assertEqual(meta.number_columns, len(expected.columns))
    self.assertEqual(meta.number_rows, len(expected))
def test_dta_metaonly(self):
    """Check that ``metadataonly=True`` returns no rows but the same metadata.

    ``sample.dta`` is read twice, once normally and once with
    ``metadataonly=True``. The second frame must be empty, and the column
    count, row count, column names and column labels must match the full read.
    """
    sample_path = os.path.join(self.basic_data_folder, "sample.dta")
    df, meta = pyreadstat.read_dta(sample_path)
    df2, meta2 = pyreadstat.read_dta(sample_path, metadataonly=True)

    self.assertTrue(df2.empty)
    # assertEqual reports both operands on failure, unlike assertTrue(a == b).
    self.assertEqual(meta.number_columns, meta2.number_columns)
    self.assertEqual(meta.number_rows, meta2.number_rows)
    self.assertEqual(meta.column_names, meta2.column_names)
    self.assertEqual(meta.column_labels, meta2.column_labels)
#formatted_csv = os.path.join(self.missing_data_folder, "missing_dta_formatted.csv")
#labeled_csv = os.path.join(self.missing_data_folder, "missing_dta_labeled.csv")
df_csv = pd.DataFrame([[3,"a"],["a","b"]], columns=["Var1", "Var2"])
df_csv2 = pd.DataFrame([[3,"a"],["labeled","b"]], columns=["Var1", "Var2"])
missing_user_values = {'Var1': ['a']}
variable_value_labels = {'Var1':{'a':'labeled'}}
path = os.path.join(self.write_folder, "user_missing_write.dta")
pyreadstat.write_dta(df_csv, path, version=12, missing_user_values=missing_user_values, variable_value_labels=variable_value_labels)
df_dta, meta = pyreadstat.read_dta(path, user_missing=True)
self.assertTrue(df_csv.equals(df_dta))
self.assertDictEqual(meta.missing_user_values, missing_user_values)
df_dta2, meta2 = pyreadstat.read_dta(path, user_missing=True, apply_value_formats=True, formats_as_category=False)
self.assertTrue(df_csv2.equals(df_dta2))
def test_dta_write_basic(self):
    """Write ``self.df_pandas`` to a DTA file and verify the round trip.

    The frame is written with a file label, column labels and value labels
    (``version=12``), read back, and compared against the source frame and
    the metadata that was supplied on write.
    """
    label_columns = ("myord", "mylabl")
    frame = self.df_pandas.copy()
    # The labelled columns are written as int32 ...
    for name in label_columns:
        frame[name] = frame[name].astype(np.int32)

    file_label = "basic write"
    col_labels = [
        "mychar label",
        "mynum label",
        "mydate label",
        "dtime label",
        None,
        "myord label",
        "mytime label",
    ]
    variable_value_labels = {
        'mylabl': {1: 'Male', 2: 'Female'},
        'myord': {1: 'low', 2: 'medium', 3: 'high'},
    }
    target = os.path.join(self.write_folder, "basic_write.dta")
    pyreadstat.write_dta(
        frame,
        target,
        file_label=file_label,
        column_labels=col_labels,
        version=12,
        variable_value_labels=variable_value_labels,
    )

    read_back, read_meta = pyreadstat.read_dta(target)

    # ... and compared as int64 after the file has been read back.
    for name in label_columns:
        frame[name] = frame[name].astype(np.int64)

    self.assertTrue(read_back.equals(frame))
    self.assertEqual(read_meta.file_label, file_label)
    self.assertListEqual(read_meta.column_labels, col_labels)
    self.assertDictEqual(read_meta.variable_value_labels, variable_value_labels)
def test_dta_user_missing(self):
    """Read ``missing_test.dta`` three ways and compare each with a CSV fixture.

    Covers a default read, a read with ``user_missing=True`` (also checking
    ``meta.missing_user_values``), and a read that additionally applies value
    formats without converting them to categories.
    """
    folder = self.missing_data_folder
    dta_path = os.path.join(folder, "missing_test.dta")

    # Default read, compared with the unformatted fixture.
    plain_df, _ = pyreadstat.read_dta(dta_path)
    plain_expected = pd.read_csv(os.path.join(folder, "missing_unformatted.csv"))
    self.assertTrue(plain_df.equals(plain_expected))

    # Read keeping user-defined missing values, compared with the formatted fixture.
    missing_df, missing_meta = pyreadstat.read_dta(dta_path, user_missing=True)
    missing_expected = pd.read_csv(os.path.join(folder, "missing_dta_formatted.csv"))
    self.assertTrue(missing_df.equals(missing_expected))
    expected_codes = {
        'var1': ['a'],
        'var2': ['b'],
        'var3': ['c'],
        'var4': ['x'],
        'var5': ['y'],
        'var6': ['z'],
    }
    self.assertDictEqual(missing_meta.missing_user_values, expected_codes)

    # Same read with value formats applied as plain values, not categories.
    labeled_df, _ = pyreadstat.read_dta(
        dta_path,
        apply_value_formats=True,
        user_missing=True,
        formats_as_category=False,
    )
    labeled_expected = pd.read_csv(os.path.join(folder, "missing_dta_labeled.csv"))
    self.assertTrue(labeled_df.equals(labeled_expected))
def test_dta_user_missing(self):
    """Read ``missing_test.dta`` three ways and compare each with a CSV fixture.

    Covers a default read, a read with ``user_missing=True`` (also checking
    ``meta.missing_user_values``), and a read that additionally applies value
    formats without converting them to categories.
    """
    folder = self.missing_data_folder
    dta_path = os.path.join(folder, "missing_test.dta")

    # Default read, compared with the unformatted fixture.
    plain_df, _ = pyreadstat.read_dta(dta_path)
    plain_expected = pd.read_csv(os.path.join(folder, "missing_unformatted.csv"))
    self.assertTrue(plain_df.equals(plain_expected))

    # Read keeping user-defined missing values, compared with the formatted fixture.
    missing_df, missing_meta = pyreadstat.read_dta(dta_path, user_missing=True)
    missing_expected = pd.read_csv(os.path.join(folder, "missing_dta_formatted.csv"))
    self.assertTrue(missing_df.equals(missing_expected))
    expected_codes = {
        'var1': ['a'],
        'var2': ['b'],
        'var3': ['c'],
        'var4': ['x'],
        'var5': ['y'],
        'var6': ['z'],
    }
    self.assertDictEqual(missing_meta.missing_user_values, expected_codes)

    # Same read with value formats applied as plain values, not categories.
    labeled_df, _ = pyreadstat.read_dta(
        dta_path,
        apply_value_formats=True,
        user_missing=True,
        formats_as_category=False,
    )
    labeled_expected = pd.read_csv(os.path.join(folder, "missing_dta_labeled.csv"))
    self.assertTrue(labeled_df.equals(labeled_expected))
def test_dta_write_user_missing(self):
    """Write a frame with a user-defined missing value and verify the round trip.

    A two-column frame is written (``version=12``) with ``'a'`` declared as a
    user missing value for ``Var1`` and labelled ``'labeled'``. The file is
    read back with ``user_missing=True``, first raw and then with value
    formats applied, and each result is compared with its expected frame.
    """
    column_names = ["Var1", "Var2"]
    raw_frame = pd.DataFrame([[3, "a"], ["a", "b"]], columns=column_names)
    labeled_frame = pd.DataFrame([[3, "a"], ["labeled", "b"]], columns=column_names)
    missing_user_values = {'Var1': ['a']}
    variable_value_labels = {'Var1': {'a': 'labeled'}}

    target = os.path.join(self.write_folder, "user_missing_write.dta")
    pyreadstat.write_dta(
        raw_frame,
        target,
        version=12,
        missing_user_values=missing_user_values,
        variable_value_labels=variable_value_labels,
    )

    # Raw read: the missing code comes back as written and is listed in the metadata.
    raw_back, raw_meta = pyreadstat.read_dta(target, user_missing=True)
    self.assertTrue(raw_frame.equals(raw_back))
    self.assertDictEqual(raw_meta.missing_user_values, missing_user_values)

    # Formatted read: the missing code is replaced by its value label.
    labeled_back, _ = pyreadstat.read_dta(
        target,
        user_missing=True,
        apply_value_formats=True,
        formats_as_category=False,
    )
    self.assertTrue(labeled_frame.equals(labeled_back))