building

an

ETL FRAMEWORK

GOAL

To develop a Python toolkit capable of supporting

complex ETL workflows by creating a "common language" for heterogenous data sources.

use case

Address Information System (AIS)

Old AIS

import cx_Oracle

# Connect to DB
db = cx_Oracle.connect('ais/password@ais_source')
cursor = db.cursor()

# Get parcel IDs and geometry
sql = '''
    select parcel_id, sdo_util.to_wktgeometry(shape)
    from parcels
    where address = '1 ALLEN LN'
'''

cursor.execute(sql)
parcels = cursor.fetchall()

for parcel in parcel:
    # Do something here

# Tediously form giant insert statement
insert_stmt = 'insert into parcels values({})'

New AIS

from db import connect_to_db

# Connect to DB
db = connect_to_db(db_type='postgres', dsn='ais/password@ais')

# Get parcel ID and geometry
parcels = db.read('parcels', ['parcel_id'], geom_field='shape', where="address = '1 ALLEN LN'"

for parcel in parcels:
    parcel_id = parcel['parcel_id']
    # Do something here...

# Write out
db.truncate('parcels')
db.bulk_insert('parcels', parcels, geom_field='shape', to_srid=4326, chunk_size=100000)
db.create_index('parcels', 'parcel_id')

config.py

CONFIG = {
    'db': {
        'ais_work': {
            'db_type': 'postgres',
            'dsn': 'dbname=ais_newschema host=192.168.104.170 user=postgres \
                password=pwd',
        },
        'ulrs': {
            'db_type': 'oracle',
            'dsn': 'gis_ulrs2/pwd@gis'
        },
    },

    'addresses': {
        'sources': [
            {
                'table':                 'opa_property',
                'db':                    'ais_work',
                'address_field':         'street_address',
                'key_fields': [
                    {
                        'source_name':        'account_num',
                        'name':                'opa_account_num',
                        'type':                'text'
                    },
                    {
                        'source_name':        'pwd_parcel_id',
                        'name':               'pwd_parcel_id',
                        'type':               'integer'
                    }
                ]
            },
            {
                'table':                 'pwd_parcel',
                'db':                    'ais_work',
                'address_field':         'street_address',
                'key_fields': [
                    {
                        'source_name':        'parcel_id',
                        'name':               'pwd_parcel_id',
                        'type':               'integer'
                    }
                ]
            },

benefits

  • Less time writing code
  • Not database-specific; no SQL statements
  • Config-able

how could we make this generic enough to support a variety of etl workflows?

ais db module +

TRANSFORMS +

hygiene CHECKS +

IN-MEMORY DB CAPABILITY +

MORE ROBUST SYNTAX =

ingress

connect to anyThing

  • SDE
  • PostGIS
  • Oracle Spatial
  • SQL Server
  • CSV
  • Excel
  • GeoJSON
  • AGO
  • Socrata
  • SFTP
  • Access
  • FoxPro
  • DB/2
  • ...

Connection spec lets you write a data adapter for anything that is Python-accessible.

native python objects

SECRET SAUCE:

TransformS

import ingress

# Connect to DB
db = ingress.connect('postgres://user:password@localhost:5432/db')

# Read in 5 random parcels sorted by address
parcels = db.table('parcels').sort('address').random().limit(5)

# Convert parcel ID to an integer
parcels.transform('parcel_id', int)

# Make address uppercase
parcels.transform('address', 'uppercase')

# Apply a custom function
def standardize_address(addr):
    # Do complicated things here...
    return addr

parcels.transform('address', standardize_address)

# Write back out
db.table('parcels').reload(parcels)

# Get errors
for error in parcels.errors:
    print(error)

hygiene CHECKS

import ingress

# Connect to OPA Property file
props = ingress.connect('csv://path/to/prop2016.txt')

# Make sure numeric fields are numeric. Any non-passing rows
# are set aside and may be logged.
props.check('unit_num', 'isnumeric')\
     .check('house_num', 'isnumeric')

# Make sure string fields are < 255 characters
props.check('owner', lambda x: len(x) < 255)

# Write out passing rows to GeoDB2
geodb2 = bodega.connect('oracle+sde://gis_opa:password@gis')
geodb2.table('real_estate').reload(props)

IN-memory database

import ingress
from shapely.geometry import Point

# Connect to DB
db = ingress.connect('postgres://user:password@localhost:5432/db')

# Read in service areas
service_areas = db.table('service_areas').all()

# Make spatial index
service_areas.create_spatial_index('geometry')

# Loop over addresses
for address in db.table('addresses').all():
    xy = address['geometry']
    service_area = service_areas.intersect(xy)

    # Do something else...

Building an ETL Framework

By rbrtmrtn

Building an ETL Framework

  • 404