def test_validate_warnings_table_limit():
    source = 'data/datapackages/invalid/datapackage.json'
    report = validate(source, preset='datapackage', table_limit=1)
    assert len(report['warnings']) == 1
    assert 'table(s) limit' in report['warnings'][0]
def test_validate_table_invalid_row_limit(log):
    report = validate('data/invalid.csv', row_limit=2, infer_schema=True)
    assert log(report) == [
        (1, None, 3, 'blank-header'),
        (1, None, 4, 'duplicate-header'),
        (1, 2, 3, 'missing-value'),
        (1, 2, 4, 'missing-value'),
    ]
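These assertions compare against a `log` test fixture that is not shown in this excerpt. A minimal stand-in, assuming the usual goodtables report layout in which each entry of report['tables'] carries an 'errors' list with 'code', 'row-number' and 'column-number' keys, could look like this (a hypothetical helper, not the project's actual fixture):

def log(report):
    # Flatten a goodtables report into (table_number, row_number,
    # column_number, code) tuples, matching the expected values above.
    result = []
    for table_number, table in enumerate(report['tables'], start=1):
        for error in table['errors']:
            result.append((
                table_number,
                error.get('row-number'),
                error.get('column-number'),
                error['code'],
            ))
    return result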
def test_validate_infer_datapackage_dict(log):
    with open('data/datapackages/invalid/datapackage.json') as file:
        report = validate(json.load(file))
    assert report['error-count'] == 2
def test_check_maximum_constraint(log):
    source = [
        ['row', 'score'],
        [2, 1],
        [3, 2],
        [4, 3],
        [5, 4],
        [6],
    ]
    schema = {'fields': [
        {'name': 'row', 'type': 'integer'},
        {'name': 'score', 'type': 'integer', 'constraints': {'maximum': 2}},
    ]}
    report = validate(source, schema=schema, checks=[
        'maximum-constraint',
    ])
    assert log(report) == [
        (1, 4, 2, 'maximum-constraint'),
        (1, 5, 2, 'maximum-constraint'),
    ]
def test_check_deviated_value_not_enough_data(log):
    source = [
        ['temperature'],
        [1],
    ]
    report = validate(source, checks=[
        {'deviated-value': {'column': 'temperature'}},
    ])
    assert log(report) == []
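As the snippet above shows, a check can be configured by passing a dict of options rather than a bare check name. A sketch of a fuller configuration follows; the 'average' and 'interval' option names are assumptions, since only 'column' appears in the original.

report = validate(source, checks=[
    # 'average' and 'interval' are assumed option names for this check.
    {'deviated-value': {'column': 'temperature', 'average': 'median', 'interval': 3}},
])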
def test_check_file_integrity(log):
    source = deepcopy(DESCRIPTOR)
    report = validate(source)
    assert log(report) == []
            'data': [
                ['id1', 'id2'],
                ['a', '1'],
                ['a', '1'],
            ],
            'schema': {
                'fields': [
                    {'name': 'id1'},
                    {'name': 'id2'},
                ],
                'primaryKey': ['id1', 'id2']
            }
        }
    ],
}
report = validate(descriptor, skip_checks=['duplicate-row'])
assert log(report) == [
    (1, 3, 1, 'unique-constraint'),
]
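For comparison with skip_checks, here is a short sketch (file path assumed) of the inverse approach: enabling only an explicit set of checks so everything else is ignored.

import goodtables

# Run only the header-structure checks against a CSV (hypothetical path).
report = goodtables.validate('data/example.csv', checks=[
    'blank-header',
    'duplicate-header',
])
print(report['valid'], report['error-count'])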
        microsecond=0).isoformat() + 'Z'
    # Have to set this to 'data-package' rather than 'tabular-data-package'
    # due to a DataHub.io bug
    pkg.descriptor['profile'] = 'data-package'
    pkg.commit()
    # save the datapackage
    print("Validating pudl-msha data package")
    if not pkg.valid:
        print("PUDL MSHA DATA PACKAGE IS NOT VALID.")
        return 1
    pkg.save(os.path.join(output_dir, 'datapackage.json'))
    # Validate some of the data...
    print("Validating pudl-msha data")
    report = goodtables.validate(
        os.path.join(output_dir, 'datapackage.json'),
        row_limit=args.row_limit)
    if not report['valid']:
        print("PUDL MSHA DATA TABLES FAILED TO VALIDATE")
        pprint(report)
        return 1
    shutil.copyfile(os.path.join(input_dir, "README.md"),
                    os.path.join(output_dir, "README.md"))
    shutil.copyfile(os.path.join(input_dir, sys.argv[0]),
                    os.path.join(output_dir, "scripts", sys.argv[0]))
    return 0
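When validation fails, pprint dumps the entire report. Because each entry in report['tables'] carries its own 'errors' list, a more selective summary is possible; a sketch (not part of the PUDL script):

def summarize_report(report):
    # Print one line per error, grouped by table source.
    for table in report['tables']:
        if not table['valid']:
            print(table['source'])
            for error in table['errors']:
                print('  {code}: {message}'.format(**error))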
def validate(source, return_tables=False):
    # Start clock
    start = datetime.datetime.now()
    # Initialize report
    checks = [
        'blank-header',
        'duplicate-header',
        'non-matching-header',
        'extra-header',
        'missing-header',
    ]
    report = goodtables.validate(
        source=source, preset='datapackage', checks=checks,
        table_limit=math.inf, row_limit=0)
    # Remove the row(s)-limit warnings triggered by row_limit=0
    report['warnings'] = [w for w in report['warnings']
                          if not re.search(r'row\(s\) limit', w)]
    # Retrieve descriptor
    package = datapackage.Package(source).descriptor
    resources = package.get('resources', [])
    names = [resource['name'] for resource in resources]
    # Expand descriptor
    for resource, name in zip(resources, names):
        schema = resource['schema']
        if 'primaryKey' in schema:
            schema['primaryKey'] = _as_list(schema['primaryKey'])
        if 'uniqueKeys' in schema:
            schema['uniqueKeys'] = [_as_list(k) for k in schema['uniqueKeys']]
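The `_as_list` helper used above is not included in this excerpt; from its usage it evidently normalizes primaryKey/uniqueKeys entries to lists. A plausible definition:

def _as_list(value):
    # Wrap a scalar (e.g. a single field name) in a list; pass lists through.
    return value if isinstance(value, list) else [value]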
    }
    # Use that descriptor to instantiate a Package object
    data_pkg = datapackage.Package(pkg_descriptor)
    # Validate the data package descriptor before we go any further
    if not data_pkg.valid:
        logger.warning(f"""
            Invalid tabular data package: {data_pkg.descriptor["name"]}
            Errors: {data_pkg.errors}""")
    data_pkg.save(pkg_json)
    if not dry_run:
        # Validate the data within the package using goodtables:
        report = goodtables.validate(pkg_json, row_limit=100_000)
        if not report['valid']:
            logger.warning("Data package data validation failed.")
    return data_pkg
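Pulling the pieces above together, here is a minimal end-to-end sketch (package name and file paths are assumptions) of building a package with datapackage-py and then validating its data with goodtables:

import datapackage
import goodtables

# Describe a package around an existing CSV (hypothetical path), infer
# resource schemas, then validate the descriptor and the data separately.
pkg = datapackage.Package({'name': 'example', 'resources': [
    {'name': 'numbers', 'path': 'numbers.csv'},
]})
pkg.infer()
if not pkg.valid:
    raise SystemExit(pkg.errors)
pkg.save('datapackage.json')
report = goodtables.validate('datapackage.json', preset='datapackage', row_limit=1000)
print(report['valid'], report['error-count'])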