import logging

from voluptuous import (
    ALLOW_EXTRA,
    All,
    MultipleInvalid,
    Range,
    Required,
    Schema,
)

# Each sale record must carry a float `sale_amount` within the range
# 2.50 to 1450.99; keys other than `sale_amount` are accepted unvalidated
# (extra=ALLOW_EXTRA).
schema = Schema({
    Required('sale_amount'): All(float,
                                 Range(min=2.50, max=1450.99)),
}, extra=ALLOW_EXTRA)

# Validate the records one at a time so that a single bad record is logged
# and counted instead of stopping the whole run.
# NOTE(review): `sales` is presumably a pandas DataFrame indexed by sale id
# (it is transposed and converted to a dict of row dicts) - confirm.
error_count = 0
for s_id, sale in sales.T.to_dict().items():
    try:
        schema(sale)
    except MultipleInvalid as e:
        # .get() rather than [...]: the record may have failed validation
        # precisely because 'sale_amount' is absent, and a KeyError raised
        # here would abort the loop instead of reporting the problem.
        logging.warning('issue with sale: %s (%s) - %s',
                        s_id, sale.get('sale_amount'), e)
        error_count += 1
Data Validation with Voluptuous
from tdda.constraints.pdconstraints import discover_constraints, verify_df
# Single definition of where the discovered constraints are stored, used for
# both the write and the later verification.
constraints_path = '../data/ignore-iot_constraints.tdda'

# Derive constraints from the reference DataFrame and save them as JSON.
constraints = discover_constraints(df)
with open(constraints_path, 'w') as f:
    f.write(constraints.to_json())

# Load a second dataset and check it against the saved constraints file.
new_df = pd.read_csv('../data/iot_example_with_nulls.csv')
v = verify_df(new_df, constraints_path)

# Bare expressions, as written for an interactive session.
# NOTE(review): presumably the passing / failing constraint counts - confirm
# against the tdda documentation.
v.passes
v.failures
TDDA: Test-Driven Data Analysis
# save a version of your data
>>> dora.data
A B C D useless_feature
0 1 2 0 left 1
1 4 NaN 1 right 1
2 7 8 2 left 1
>>> dora.snapshot('initial_data')
# keep track of changes to data
>>> dora.remove_feature('useless_feature')
>>> dora.impute_missing_values()
>>> dora.data
A B C D=left D=right
0 1 -1.224745 -1.224745 0.707107 -0.707107
1 4 0.000000 0.000000 -1.414214 1.414214
2 7 1.224745 1.224745 0.707107 -0.707107
>>> dora.logs
["self.remove_feature('useless_feature')", 'self.impute_missing_values()']
Data Versioning: Dora
https://github.com/great-expectations/great_expectations
https://github.com/engarde-dev/engarde (no longer in active development)
Similar: https://github.com/zaxr/bulwark
https://github.com/frictionlessdata/goodtables-py
String matching: fuzzywuzzy