def test_wrong_encoding_replaces(self):
    data_source = os.path.join(self.data_dir, 'hmt', 'BIS_spending_over__25_000_July_2014.csv')
    encoding = 'UTF-8'  # should be 'ISO-8859-2'
    decode_strategy = 'replace'
    data = datatable.DataTable(data_source, encoding=encoding, decode_strategy=decode_strategy)
    self.assertTrue(data)

def test_excel_from_url(self):
    data_source = 'https://github.com/okfn/goodtables/raw/master/examples/hmt/BIS_monthly_spend_December_2012.xls'
    data = datatable.DataTable(data_source, format='excel')
    self.assertTrue(data.headers)

def test_set_decoding_on_self_when_passed(self):
    data_source = os.path.join(self.data_dir, 'jungle', 'VilleMTP_MTP_BudgetPri_2015.csv')
    encoding = 'windows-1252'
    data = datatable.DataTable(data_source, encoding=encoding)
    self.assertEqual(data.encoding, data.passed_encoding)
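
# A minimal sketch, based on the constructor calls in the tests above, of opening
# a local CSV with an explicit encoding and a 'replace' decode strategy. The import
# path and file name are assumptions for illustration; the tests only show that
# datatable.DataTable accepts these keyword arguments and exposes headers/values.
from goodtables import datatable  # assumed module location

def open_csv_with_fallback(path):
    """Open a CSV, replacing bytes that do not decode with the given encoding."""
    data = datatable.DataTable(path, encoding='windows-1252',
                               decode_strategy='replace')
    return data.headers, list(data.values)

# headers, rows = open_csv_with_fallback('examples/spend.csv')  # hypothetical file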
def _run_valid(process_valid, run_valid):
    """Set/maintain the valid state of the run."""
    if not process_valid and run_valid:
        return False
    return run_valid
valid = True
openfiles = []

if is_table:
    data = data_source
else:
    try:
        data = datatable.DataTable(data_source, headers=headers,
                                   format=format, encoding=encoding,
                                   decode_strategy=decode_strategy,
                                   header_index=self.header_index)
        openfiles.extend(data.openfiles)
    except datatable.DataTable.RAISES as e:
        valid = False
        data = None
        if isinstance(e, exceptions.DataSourceHTTPError):
            error_type = 'http_{0}_error'.format(e.status)
        elif isinstance(e, exceptions.DataSourceDecodeError):
            error_type = 'data_decode_error'
        elif isinstance(e, exceptions.DataSourceFormatUnsupportedError):
            error_type = 'data_{0}_error'.format(e.file_format)
        elif isinstance(e, exceptions.DataSourceMalformatedError):
            error_type = 'invalid_{0}_error'.format(format)
        entry = self.make_entry(
            processor='base',
            result_category=self.RESULT_CATEGORY_FILE,
            result_level=self.RESULT_LEVEL_ERROR,
            result_message=e.msg,
"""
def _run_valid(process_valid, run_valid):
"""Set/maintain the valid state of the run."""
if not process_valid and run_valid:
return False
return run_valid
valid = True
openfiles = []
if is_table:
data = data_source
else:
try:
data = datatable.DataTable(data_source, headers=headers,
format=format, encoding=encoding,
decode_strategy=decode_strategy,
header_index=self.header_index)
openfiles.extend(data.openfiles)
except datatable.DataTable.RAISES as e:
valid = False
data = None
if isinstance(e, exceptions.DataSourceHTTPError):
error_type = 'http_{0}_error'.format(e.status)
elif isinstance(e, exceptions.DataSourceDecodeError):
error_type = 'data_decode_error'
elif isinstance(e, exceptions.DataSourceFormatUnsupportedError):
error_type = 'data_{0}_error'.format(e.file_format)
elif isinstance(e, exceptions.DataSourceMalformatedError):
error_type = 'invalid_{0}_error'.format(format)
report_backend = 'yaml'
report_options = {
    'schema': helpers.report_schema,
    'backend': report_backend,
    'client_stream': self.report_stream,
    'limit': self.report_limit,
    'post_task': self.report_post_task
}
self.report = tellme.Report('Pipeline', **report_options)
self.pipeline = self.get_pipeline()

try:
    self.data = datatable.DataTable(self.data_source, format=self.format,
                                    encoding=encoding,
                                    decode_strategy=decode_strategy,
                                    header_index=self.header_index)
    self.openfiles.extend(self.data.openfiles)
except datatable.DataTable.RAISES:
    self.data = self.data_source
def get_dataset_csv(self):
    """Get the dataset from a CSV file for this batch process."""
    dataset = []
    resources = datatable.DataTable(self.source, encoding='utf-8')
    data_index = resources.headers.index(self.data_key)
    keys_header_index = {}
    for key in [self.schema_key, self.format_key, self.encoding_key]:
        if key in resources.headers:
            keys_header_index[key] = resources.headers.index(key)
    for entry in resources.values:
        rv = {'data': entry[data_index], 'schema': None, 'encoding': None,
              'format': None}
        for key, index in keys_header_index.items():
            if index is not None:
                rv[key] = entry[index]
        dataset.append(rv)
    return dataset
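
# A hedged sketch of the batch CSV layout get_dataset_csv expects: one row per data
# source, with a column named by self.data_key plus optional schema/format/encoding
# columns, each row becoming a dict. The csv module stands in for datatable.DataTable;
# the column names and URLs below are illustrative assumptions, not taken from the source.
import csv
import io

BATCH_CSV = (
    'data,schema,format,encoding\n'
    'http://example.com/spend.csv,http://example.com/spend-schema.json,csv,utf-8\n'
)

def read_batch(stream):
    """Yield one {'data', 'schema', 'format', 'encoding'} dict per batch row."""
    for row in csv.DictReader(stream):
        yield {'data': row['data'],
               'schema': row.get('schema'),
               'format': row.get('format'),
               'encoding': row.get('encoding')}

for entry in read_batch(io.StringIO(BATCH_CSV)):
    print(entry)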
def run(self):
    """Run the pipeline."""

    def _run_valid(process_valid, run_valid):
        """Set/maintain the valid state of the run."""
        if not process_valid and run_valid:
            return False
        return run_valid

    valid = True
    for processor in self.pipeline:
        if isinstance(self.data, datatable.DataTable):
            _valid, _, self.data = processor.run(self.data, is_table=True,
                                                 encoding=self.encoding,
                                                 decode_strategy=self.decode_strategy)
        else:
            _valid, _, self.data = processor.run(self.data_source, is_table=False,
                                                 decode_strategy=self.decode_strategy,
                                                 encoding=self.encoding,
                                                 format=self.format)
        valid = _run_valid(_valid, valid)
        # If a validator returns invalid, we stop the pipeline,
        # unless break_on_invalid_processor is False.
        if not valid and self.break_on_invalid_processor:
            break

    if self.data:
        self.data.replay()
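
# A standalone sketch of how _run_valid (shown above) keeps the run-level state
# latched to False once any processor reports invalid data. The per-processor
# results below are illustrative values, not output from a real pipeline run.
def _run_valid(process_valid, run_valid):
    """Set/maintain the valid state of the run."""
    if not process_valid and run_valid:
        return False
    return run_valid

valid = True
for process_valid in (True, False, True):  # e.g. structure ok, schema failed, ...
    valid = _run_valid(process_valid, valid)
print(valid)  # False: a single failing processor marks the whole run invalid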