import logging

from voluptuous import (ALLOW_EXTRA, All, MultipleInvalid, Range, Required,
                        Schema)

# Every sale record must carry a float `sale_amount` between 2.50 and
# 1450.99; keys other than `sale_amount` are accepted (extra=ALLOW_EXTRA).
schema = Schema({
    Required('sale_amount'): All(float,
                                 Range(min=2.50, max=1450.99)),
}, extra=ALLOW_EXTRA)


# Validate each record and count the rejected ones instead of stopping at
# the first failure.
# NOTE(review): `sales` is presumably a pandas DataFrame defined earlier, so
# `.T.to_dict()` would yield one dict per row keyed by the row label - confirm.
error_count = 0
for s_id, sale in sales.T.to_dict().items():
    try:
        schema(sale)
    except MultipleInvalid as e:
        # Use .get(): a record rejected because 'sale_amount' is missing must
        # still be logged and counted, not abort the loop with a KeyError.
        logging.warning('issue with sale: %s (%s) - %s',
                        s_id, sale.get('sale_amount'), e)
        error_count += 1

Data Validation with Voluptuous

- Schema Validation

 

 

from tdda.constraints.pdconstraints import discover_constraints, verify_df

# Location of the serialized constraints; used by both the write step and
# the verification step below.
constraints_path = '../data/ignore-iot_constraints.tdda'

# Derive constraints from the reference frame `df` and store them as JSON.
constraints = discover_constraints(df)
with open(constraints_path, 'w') as f:
    f.write(constraints.to_json())

# Load a second dataset and check it against the stored constraints.
new_df = pd.read_csv('../data/iot_example_with_nulls.csv')
v = verify_df(new_df, constraints_path)

# Bare expressions: their values are only displayed when this is evaluated
# interactively (e.g. in a notebook or REPL).
v.passes
v.failures

TDDA: Test-Driven Data Analysis

- Constraint Detection

 

 

# Interactive session transcript: lines starting with `>>>` are input, the
# lines that follow them are the printed output.
# save a version of your data (stored under the name 'initial_data')
>>> dora.data
   A   B  C      D  useless_feature
0  1   2  0   left                1
1  4 NaN  1  right                1
2  7   8  2   left                1
>>> dora.snapshot('initial_data')

# keep track of changes to data: each transformation call below shows up
# as an entry in `dora.logs` at the end of the session
# NOTE(review): the frame printed after these two calls has standardized
# B/C values and D=left / D=right columns, which looks like the result of
# additional scaling / encoding steps not shown here - confirm the output.
>>> dora.remove_feature('useless_feature')
>>> dora.impute_missing_values()
>>> dora.data
   A         B         C    D=left   D=right
0  1 -1.224745 -1.224745  0.707107 -0.707107
1  4  0.000000  0.000000 -1.414214  1.414214
2  7  1.224745  1.224745  0.707107 -0.707107

>>> dora.logs
["self.remove_feature('useless_feature')", 'self.impute_missing_values()']

Data Versioning: Dora

Resources

Overview of different R packages

 

https://arxiv.org/pdf/1904.02101.pdf

 

 

 

Other Python libraries

String matching: fuzzywuzzy

Made with Slides.com