PyGotham 2018
Molly Leen
"BIG" data
{"fname": "Spongebob", "lname": "Squarepants", "type": "sponge"}
{"fname": "Patrick", "lname": "Star", "type": "starfish"}
{"fname": "Squidward", "lname": "Tentacles", "type": "squid"}
{"fname": "Eugene", "lname": "Krabs", "type": "crab"}
{"fname": "Sandy", "lname": "Cheeks", "type": "squirrel"}
{"fname": "Sheldon", "lname": "Plankton", "type": "plankton"}
{"fname": "Gary", "lname": "The Snail", "type": "snail"}
raw data
API server
POST
{
"fname": "Spongebob",
"lname": "Squarepants",
"type": "Sponge"
}
ingestion script
INSERT
Processing one by one is SLOW
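For contrast, a minimal sketch of the row-by-row approach (the DSN and the records iterable are assumptions, not from the talk):

import json
import psycopg2

conn = psycopg2.connect("dbname=bikini_bottom")  # assumed DSN
cursor = conn.cursor()

# records: an iterable of JSON lines from the API (assumed)
for line in records:
    row = json.loads(line)
    # one round trip to the database per row -- this is the slow part
    cursor.execute(
        "INSERT INTO spongebob_characters (fname, lname, type) "
        "VALUES (%s, %s, %s)",
        (row["fname"], row["lname"], row["type"]),
    )
conn.commit()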
From the PostgreSQL docs:
Use COPY to load all the rows in one command, instead of using a series of INSERT commands. The COPY command is optimized for loading large numbers of rows; it is less flexible than INSERT, but incurs significantly less overhead for large data loads.
COPY takes data from a file which is formatted to match the structure of the table
"Spongebob,Squarepants,sponge"
"Patrick,Star,starfish"
"Squidward,Tentacles,squid"
"Eugene,Krabs,crab"
"Sandy,Cheeks,squirrel"
"Sheldon,Plankton,plankton"
"Gary,The\ Snail,snail"
fname | lname | type
------+-------+-----
      |       |
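If the CSV already lived on disk next to the ingestion script, COPY is one call; a minimal sketch with psycopg2 (file name, table, and DSN are assumptions):

import psycopg2

conn = psycopg2.connect("dbname=bikini_bottom")  # assumed DSN
cursor = conn.cursor()

# a single COPY loads every row from the file
with open("characters.csv") as f:
    cursor.copy_from(f, "spongebob_characters",
                     columns=("fname", "lname", "type"), sep=",")
conn.commit()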
COPY imports data from a file
The file is not on the server
raw data
API server
with requests.get(url, stream=True) as s3_data:
    # iter_lines() gives us a generator of lines we can
    # consume in a loop or by calling next()
    lines = s3_data.iter_lines(decode_unicode=True)
    print(next(lines))
{"fname": "Spongebob", "lname": "Squarepants", "type": "sponge"}
prints...
Can we make our generator LOOK like a file?
According to the Python docs:
Generator | File
--- | ---
next() to get next line | readline() to get next line
raises StopIteration when done | returns an empty string when done
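A quick illustration of the two behaviors, using an in-memory file:

import io

gen = iter(["a", "b"])
next(gen)  # 'a'
next(gen)  # 'b'
# one more next(gen) raises StopIteration

f = io.StringIO("a\nb\n")
f.readline()  # 'a\n'
f.readline()  # 'b\n'
f.readline()  # '' -- empty string at EOF, no exception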
class FileLike(object):
    def __init__(self, s3_data):
        self.data = s3_data  # generator

    def readline(self):
        try:
            return next(self.data)
        except StopIteration:
            # file objects signal EOF with an empty string
            return ''

    def read(self, size=-1):
        # some consumers call read() instead of readline()
        return self.readline()
with requests.get(url, stream=True) as s3_data:
    file_like_data = FileLike(s3_data.iter_lines(decode_unicode=True))
    print(file_like_data.readline())
{"fname": "Spongebob", "lname": "Squarepants", "type": "sponge"}
prints...
The file must be a CSV that matches the format of the table
But the file is JSON Lines
{"fname": "Spongebob", "lname": "Squarepants", "type": "sponge"}
{"fname": "Patrick", "lname": "Star", "type": "starfish"}
{"fname": "Squidward", "lname": "Tentacles", "type": "squid"}
{"fname": "Eugene", "lname": "Krabs", "type": "crab"}
{"fname": "Sandy", "lname": "Cheeks", "type": "squirrel"}
{"fname": "Sheldon", "lname": "Plankton", "type": "plankton"}
{"fname": "Gary", "lname": "The Snail", "type": "snail"}
{"fname": "Spongebob", "lname": "Squarepants", "type": "sponge"}
"Spongebob,Squarepants,sponge"
["Spongebob", "Squarepants", "sponge"]
"Spongebob,Squarepants,sponge"
["Gary", "The Snail", "snail"]
"Gary,The\ Snail,snail"
from queue import Queue

class FileQueue(Queue):
    def write(self, data):
        # csv.writer calls write() with a formatted row; stash it
        self.put(data)

    def read(self):
        # hand the formatted row back out
        return self.get()
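csv.writer only needs an object with a write() method, so it happily "writes" formatted rows into the queue. For example:

import csv

fq = FileQueue()
writer = csv.writer(fq)
writer.writerow(["Gary", "The Snail", "snail"])
print(fq.read())  # 'Gary,The Snail,snail\r\n'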
JSON
{"fname": "Spongebob", "lname": "Squarepants", "type": "sponge"}
List (prep_for_csv step)
["Spongebob", "Squarepants", "sponge"]
Write to CSV (formats and "puts" to FileQueue)
Read from FileQueue and return
"Spongebob,Squarepants,sponge"
import csv

class FileLike(object):
    def __init__(self, s3_data):
        self.data = s3_data  # generator
        self.file_queue = FileQueue()
        self.writer = csv.writer(self.file_queue)

    def readline(self):
        try:
            data = next(self.data)
        except StopIteration:
            return ''
        # converts JSON to list of values
        data_for_csv = prep_for_csv(data)
        # csv.writer formats the row and "puts" it on the FileQueue
        self.writer.writerow(data_for_csv)
        return self.file_queue.read()

    def read(self, size=-1):
        return self.readline()
with requests.get(url, stream=True) as s3_data:
    file_like_data = FileLike(s3_data.iter_lines(decode_unicode=True))
    print(file_like_data.readline())
"Spongebob,Squarepants,sponge"
prints...
with requests.get(url, stream=True) as s3_data:
    file_like_data = FileLike(s3_data.iter_lines(decode_unicode=True))
    table = 'spongebob_characters'
    columns = ['fname', 'lname', 'type']
    cursor.copy_from(file_like_data, table,
                     columns=columns, sep=',')
The copy_from command reads from our "file"
and all processing happens when data is read!
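Putting the pieces together, a condensed end-to-end sketch under the same assumptions (DSN and URL are placeholders):

import psycopg2
import requests

conn = psycopg2.connect("dbname=bikini_bottom")  # assumed DSN
cursor = conn.cursor()

url = "https://s3.amazonaws.com/spongebob/characters.jsonl"  # assumed
with requests.get(url, stream=True) as s3_data:
    file_like_data = FileLike(s3_data.iter_lines(decode_unicode=True))
    cursor.copy_from(file_like_data, 'spongebob_characters',
                     columns=['fname', 'lname', 'type'], sep=',')

conn.commit()  # the COPY is part of the transaction until we commit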
API server
raw data
ingestion script
COPY
{
"file": "s3://spongebob"
}