Make life less miserable
Sort of...
Converting Data
Moving Data
Converting then Moving Data


https://csvkit.readthedocs.org/en/latest/
https://dataset.readthedocs.org/en/latest/
http://paulfitz.github.io/daff/
unix: cat, head, awk/sed, tail, |, grep, sort, uniq, cut...
Still -- There are problems
import csv

f = open("file.csv")
f_csv = csv.reader(f)
header = next(f_csv)        # skip the header row
for line in f_csv:
    munge(line)             # hand-roll the cleanup for every row
get_money()
df = pd.read_csv("/path/to/file.csv")
pd.read_clipboard
pd.read_csv        
pd.read_excel      
pd.read_fwf        
pd.read_gbq        
pd.read_hdf        
pd.read_html       
pd.read_json       
pd.read_msgpack    
pd.read_pickle     
pd.read_sql        
pd.read_sql_query  
pd.read_sql_table  
pd.read_stata
pd.read_table
df.to_clipboard
df.to_csv
df.to_dense
df.to_dict
df.to_excel
df.to_gbq
df.to_hdf
df.to_html
df.to_json
df.to_latex
df.to_msgpack
df.to_panel
df.to_period
df.to_pickle
df.to_records
df.to_sparse
df.to_sql
df.to_stata
df.to_string
df.to_wide
Input
Output
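A quick illustration of the Input/Output pattern (the file names are made up; any pd.read_* / df.to_* pair works the same way):
import pandas as pd

df = pd.read_csv("iris.csv")     # Input: parse a CSV into a DataFrame
df.to_json("iris.json")          # Output: write the same data back out as JSON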
import pandas as pd
from glob import glob
df_list = []
csv_list = glob("iris*.csv")
for csv in csv_list:
  df_list.append(pd.read_csv(csv))
df = pd.concat(df_list)
How about a pile of CSVs?
from odo import odo
import pandas as pd
df = odo("*.csv", pd.DataFrame)
def read_csv_dir(pattern):            # pattern is a placeholder, e.g. "iris*.csv"
    df_list = []
    csv_list = glob(pattern)
    for csv in csv_list:
        df_list.append(pd.read_csv(csv))
    return pd.concat(df_list)
Codify Function in a Framework
Can be Crazy?
Done everything?

conn = get_connect()
cursor = conn.cursor()
sql = "CREATE TABLE iris(s_length REAL, s_width REAL, ..., species TEXT);"
cursor.execute(sql)
data = read_csv(...)
sql = "INSERT INTO iris(s_length, s_width, ..., species) " \
      "VALUES(?, ?, ..., ?)"
cursor.executemany(sql, data)
conn.commit()
conn = get_connect()
df = pd.read_csv(...)
df.to_sql(name=tbl_name, con=conn, ...)
## pile of csvs
df = odo("*.csv", pd.DataFrame)
df.to_sql(...)
Wait, what?!
What is `to_sql` doing?
Type Conversion
sqlalchemy.types
Table Building
Columns and Types
Chunked Insert
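Spelled out by hand with SQLAlchemy, those steps look roughly like this (a sketch, not pandas' actual implementation; the file, engine URI, and dtype mapping are illustrative):
import pandas as pd
import sqlalchemy as sa

engine = sa.create_engine("sqlite:///iris.db")
df = pd.read_csv("iris.csv")

# Type Conversion: map numpy dtypes onto sqlalchemy.types
sa_types = {"float64": sa.Float, "int64": sa.Integer}
columns = [sa.Column(name, sa_types.get(str(dtype), sa.Text)())
           for name, dtype in zip(df.columns, df.dtypes)]

# Table Building: create a table with those columns and types
metadata = sa.MetaData()
table = sa.Table("iris", metadata, *columns)
metadata.create_all(engine)

# Chunked Insert: push the rows in batches
rows = df.to_dict("records")
with engine.begin() as conn:
    for start in range(0, len(rows), 1000):
        conn.execute(table.insert(), rows[start:start + 1000])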
I knew a DBA -- she said,
"Bulk Loading is a Thing"
COPY FROM # PostgreSQL
LOAD DATA # MySQL
BULK INSERT # MS SQL
BCP
BULK COLLECT # ORACLE
.import # SQLITE
COPY {tblname} FROM '{abspath}'
    (FORMAT {format_str},
     DELIMITER E'{delimiter}',
     NULL '{na_value}',
     QUOTE '{quotechar}',
     ESCAPE '{escapechar}',
     HEADER {header},
     ENCODING '{encoding}');"""
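Filled in and executed by hand, that template might look something like this (a sketch using psycopg2; table name, file path, and option values are assumptions, and COPY FROM a path requires the file to be readable by the database server):
import psycopg2

# hypothetical values plugged into the template above
copy_sql = """COPY iris FROM '/tmp/iris.csv'
    (FORMAT csv,
     DELIMITER E',',
     NULL '',
     QUOTE '"',
     HEADER TRUE,
     ENCODING 'utf8');"""

conn = psycopg2.connect("dbname=iris user=postgres")
with conn:
    with conn.cursor() as cursor:
        cursor.execute(copy_sql)   # one server-side bulk load instead of row-by-row INSERTs
conn.close()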
odo("iris.csv", "sqlite:///iris.db")
odo("iris.csv", "postgresql://localhost::iris")
Odo: A Framework for leveraging other people's misery...
odo("*.csv", "db://hostname::tablename")
odo("iris.json", "mongodb://hostname/db::collection")
odo("*.json", "mongodb://hostname/db::collection")
## Full URI
sqldb://username:password@hostname:port/database::tablename
That looks familiar...
# scp filename username@hostname:/path/to/file
# paramiko is an ssh client in Python
import paramiko
ssh = paramiko.SSHClient()
ssh.connect(...)
ftp = ssh.open_sftp()
ftp.put(local_file, remote_file)
odo('iris.csv', 'ssh://hostname:myfile.csv',
     username='user', key_filename='.ssh/id_rsa', port=22)
from pywebhdfs.webhdfs import PyWebHdfsClient
hdfs = PyWebHdfsClient(host=host, port=str(port),
                       user_name=user)
hdfs.read_file(...)
hdfs.append_file(...)
pytables
h5py
HDFStore
No variable-length strings
No Python
missing data
some rough edges
kind of new
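For pandas, the HDFStore route looks like this (a minimal sketch; file and key names are made up, PyTables must be installed, and string columns are stored fixed-width, hence the variable-length string caveat):
import pandas as pd

df = pd.read_csv("iris.csv")                    # hypothetical input file
df.to_hdf("iris.h5", "iris", format="table")    # write through pandas' HDFStore / PyTables
df2 = pd.read_hdf("iris.h5", "iris")            # read it back into a DataFrame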
odo('s3://mybucket/path/to/data.json',
    'postgresql://user:passwd@localhost:port/db::data')
Not a new idea:
import pandas.io.data as web
web.DataReader("IBM", 'yahoo', start, end)
web.DataReader("IBM", 'google', start, end)
$ FRED
$ Yahoo! Option
$ Fama/French
...
@append.register(CSV, URL(CSV))
def append_urlX_to_X(target, source, **kwargs):
    with closing(urlopen(source.url)) as r:
        chunk_size = 16 * source.chunk_size
        with open(target.path, 'wb') as fp:
            for chunk in iter(curry(r.read, chunk_size), ''):
                fp.write(chunk)
    return target
@discover.register((URL(CSV), URL(JSONLines)))
def discover_url_line_delimited(c, lines=5, encoding='utf-8', **kwargs):
    """Discover CSV and JSONLines files from URL."""
    with sample(c, lines=lines, encoding=encoding) as fn:
        return discover(c.subtype(fn, **kwargs), **kwargs)
Sample: sip a few lines of data
@convert.register(Temp(CSV), (Temp(URL(CSV)), URL(CSV)))
def url_file_to_temp_file(data, **kwargs):
    fn = '.%s' % uuid.uuid1()
    target = Temp(data.subtype)(fn, **kwargs)
    return append(target, data, **kwargs)
data: URL(CSV)
Temp(CSV): CSV with garbage collection
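The Temp idea is just "a file that cleans itself up once nothing references it"; a toy stand-in (not odo's actual class) might look like:
import os
import uuid

class TempCSV(object):
    """Toy version of Temp(CSV): the backing file is removed
    when the object is garbage collected."""
    def __init__(self):
        self.path = '.%s.csv' % uuid.uuid1()

    def __del__(self):
        try:
            os.remove(self.path)
        except OSError:
            pass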
@resource.register('ftp://.+', priority=16)
@resource.register('http://.+', priority=16)
@resource.register('https://.+', priority=16)
def resource_url(uri, **kwargs):
    path = os.path.basename(urlparse(uri).path)
    try:
        subtype = types_by_extension[ext(path)]
    except KeyError:
        subtype = type(resource(path))
    return URL(subtype)(uri, **kwargs)
from odo import odo
import pandas as pd
url = "http://foo.com/file.csv"
df = odo(url, pd.DataFrame)
auth = {'user': 'hdfs', 'port': '14000'}
odo(url, 'hdfs://hdfs_ip:/tmp/iris/iris.txt', **auth)
conda install odo
pip install odo