Barrett Strausser
@FuncBearrito
# Append EC2 credentials to whichever profile file your shell sources:
sudo vi ~/.bashrc || sudo vi ~/.bash_profile

# No spaces around '=' in shell assignments — 'export VAR = x' is an error.
export EC2_HOME=...                 # path to the EC2 API tools
export EC2_PRIVATE_KEY=pk-foo.pem   # X.509 private key downloaded from AWS
export EC2_CERT=cert-foo.pem        # matching X.509 certificate
export EC2_KEYPAIR=...              # name of your EC2 SSH key pair
export EC2_URL=...                  # regional EC2 endpoint URL

# apt-get takes space-separated package names, not commas; pip is packaged
# as 'python-pip' on Debian/Ubuntu.
sudo apt-get install python python-pip
pip install boto
Machine-1 2012-11-28 22:42:00 0.1722 0.108 6.2504
Machine-2 2012-11-28 22:42:00 0.0185 0.3336 3.316
Machine-3 2012-11-28 22:42:00 1.6843 0.2725 1.3314
Machine-1 2012-11-28 22:42:00 0.1482 0.1422 0.3965
import datetime
import random

def make_record(i):
    """Build one tab-separated sensor record for machine (i % 3) + 1.

    Channels: Normal(mu=i % 3, sigma=1), Exponential(rate=3) and
    LogNormal(mu=0, sigma=2), each rounded to 4 decimal places.
    The original prepended a stray space to the machine name; the sample
    data shows no such space, so it is dropped here.
    """
    fields = [
        'Machine-' + str((i % 3) + 1),
        str(datetime.datetime.today().replace(second=0, microsecond=0)),
        str(round(random.normalvariate((i % 3), 1), 4)),
        str(round(random.expovariate(3), 4)),
        str(round(random.lognormvariate(0, 2), 4)),
    ]
    return '\t'.join(fields) + '\n'

if __name__ == '__main__':
    # Write 10,000 synthetic records for the streaming demo.
    with open('/home/me/Git/EMR-DEMO/code/resources/mapper_input', 'w+') as f:
        for i in range(10000):
            f.write(make_record(i))
#!/usr/bin/env python
# encoding: utf-8
import sys
import decimal
def some_function(sensor_record):
    """Quantize each channel reading to 3 decimal places, truncating.

    Args:
        sensor_record: iterable of numeric strings (one per channel).
    Returns:
        Space-separated string of the truncated Decimal values.
    """
    quantum = decimal.Decimal('.001')
    scaled = [decimal.Decimal(r).quantize(quantum, rounding=decimal.ROUND_DOWN)
              for r in sensor_record]
    # Single-space join: the original appended a trailing space that
    # downstream consumers then had to strip off.
    return ' '.join(str(s) for s in scaled)
def record_cleaner(record):
    """Split one tab-separated input line into (machine_key, channel values).

    Expected shape: '<machine>\\t<date>\\t<c1>\\t<c2>\\t<c3>'.
    """
    record = record.strip()
    (key, date, c1, c2, c3) = record.split('\t')
    values = [c1, c2, c3]
    return (key, values)
def process(count, record):
    """Emit one mapper output line and return the updated record counter.

    Output shape: '<machine_key>' TAB '<count>:<quantized channels>'.
    """
    count += 1
    (key, values) = record_cleaner(record)
    transformed_data = some_function(values)
    # print() as a function keeps the script runnable under Python 3;
    # the original Python-2-only statement form fails to parse there.
    print('%s\t%s:%s' % (key, count, transformed_data))
    return count
# Drive the mapper: number every stdin record and emit it transformed.
count = 0
for record in sys.stdin:
    count = process(count, record)
import sys
import decimal
import math
# NOTE(review): this driver calls record_cleaner/process/estimate_parameters,
# which appear later in this listing — in the runnable script the function
# definitions must precede this loop. (Unused 'key' and 'machine' variables
# from the original were dropped.)
machine_name = None
record_count = 0
running_sum = [0, 0, 0]
print('Starting', file=sys.stderr)
for record in sys.stdin:
    if not record:
        continue
    (k, _i, _v) = record_cleaner(record)
    if machine_name is None:
        # First record of the stream: start accumulating for this machine.
        machine_name = k
        record_count = 0
        running_sum = [0, 0, 0]
    if machine_name != k:
        # Hadoop streaming delivers records sorted by key, so a key change
        # means the previous machine's group is complete: emit and reset.
        estimates = estimate_parameters(record_count, running_sum)
        print('%s\t%s:%s:%s:%s' % (machine_name, estimates[0], estimates[1],
                                   estimates[2], estimates[3]))
        machine_name = k
        record_count = 0
        running_sum = [0, 0, 0]
    (record_count, running_sum) = process(record_count, running_sum, record)

# Flush the final group; the guard makes an empty input stream emit nothing
# instead of raising on undefined accumulators.
if machine_name is not None:
    estimates = estimate_parameters(record_count, running_sum)
    print('%s\t%s:%s:%s:%s' % (machine_name, estimates[0], estimates[1],
                               estimates[2], estimates[3]))
def process(record_count, running_sum, record):
    """Fold one mapper output line into the per-machine running sums.

    Returns the updated (record_count, running_sum) pair.
    """
    record_count += 1
    cleaned = record_cleaner(record)
    # cleaned is (key, id, value); only the value payload holds channel data.
    data_tuple = record_to_datatuple(cleaned[2])
    running_sum = compute_sum(running_sum, data_tuple)
    return (record_count, running_sum)
def record_cleaner(record):
    """Parse one mapper output line into (machine_key, record_id, value).

    Expected shape: '<key>\\t<id>:<space-separated channel values>'.
    """
    record = record.strip()
    fields = record.split('\t')
    key = fields[0]
    payload = fields[1].split(":")
    # 'record_id' rather than the original 'id', which shadows the builtin.
    record_id = payload[0]
    value = payload[1]
    return (key, record_id, value)
def record_to_datatuple(record):
    """Convert a space-separated value string into a list of Decimals,
    each truncated (ROUND_DOWN) to three decimal places."""
    quantum = decimal.Decimal('.001')
    # Loop variable renamed: the original reused 'record', shadowing the arg.
    return [decimal.Decimal(field).quantize(quantum, rounding=decimal.ROUND_DOWN)
            for field in record.split(' ')]
def compute_sum(running_sum, data_tuple):
    """Element-wise add the three channel values into running_sum.

    Malformed tuples are logged to stderr and otherwise ignored — this
    preserves the original best-effort behaviour, but with named exception
    types instead of a bare 'except'. A short tuple still leaves the
    elements added before the failure in place, as before.
    """
    try:
        running_sum[0] += data_tuple[0]
        running_sum[1] += data_tuple[1]
        running_sum[2] += data_tuple[2]
    except (IndexError, TypeError, decimal.InvalidOperation):
        print('error in running sum with tuple %s' % data_tuple,
              file=sys.stderr)
    return running_sum
def estimate_parameters(record_count, running_sum):
    """Turn per-machine channel sums into distribution parameter estimates.

    Returns (normal_mean, exponential_rate, lognormal_mu, lognormal_sigma).
    Raises ZeroDivisionError when record_count is 0 (callers guard this).
    """
    # Sample mean estimates the normal channel's mu.
    normal_estimate = running_sum[0] / record_count
    # MLE of an exponential rate is the reciprocal of the sample mean.
    exponential_estimate = 1 / (running_sum[1] / record_count)
    # NOTE(review): these lognormal estimates operate on the raw value sums,
    # not sums of logs — presumably deliberate for the demo, but verify
    # against the standard lognormal MLE before reusing.
    log_sum = running_sum[2]
    log_u = log_sum / record_count
    log_numerator = log_sum * log_sum - log_sum * log_u + log_u * log_u
    log_sigma = log_numerator / record_count
    return (normal_estimate, exponential_estimate, log_u, log_sigma)
import boto
from boto.s3.key import Key
from boto.emr.step import StreamingStep
#you will need your own bucket
# --- Upload demo input and streaming scripts to S3, then launch an EMR job ---
root_path = '/home/me/Git/EMR-DEMO/code/'
s3 = boto.connect_s3()
# create_bucket returns the existing bucket if it is already owned here.
emr_demo_bucket = s3.create_bucket('bearrito.demos.emr')
emr_demo_bucket.set_acl('private')
# Copy of the input under a separate prefix for the later Hive demo.
json_records = Key(emr_demo_bucket)
json_records.key = "input/hive/mapper_input"
json_records.set_contents_from_filename(root_path + 'resources/mapper_input')
# Input consumed by the streaming step below.
input_key = Key(emr_demo_bucket)
input_key.key = "input/0/mapper_input"
input_key.set_contents_from_filename( root_path + 'resources/mapper_input')
# Mapper and reducer scripts for Hadoop streaming.
mapper_key = Key(emr_demo_bucket)
mapper_key.key = "scripts/mapper_script.py"
mapper_key.set_contents_from_filename(root_path + 'src/EMRDemoMapper.py')
reducer_key = Key(emr_demo_bucket)
reducer_key.key = "scripts/reducer_script.py"
reducer_key.set_contents_from_filename( root_path + 'src/EMRDemoReducer.py')
# One streaming step wiring the mapper + reducer to S3 input/output prefixes.
demo_step = StreamingStep(name ='EMR Demo Example'
,mapper='s3://bearrito.demos.emr/scripts/mapper_script.py'
,reducer='s3://bearrito.demos.emr/scripts/reducer_script.py'
,input='s3://bearrito.demos.emr/input/0'
,output='s3://bearrito.demos.emr/output')
emr = boto.connect_emr()
# run_jobflow returns the job-flow id; poll describe_jobflow for progress.
jobid = emr.run_jobflow(name="EMR Example",log_uri='s3://bearrito.demos.logs',steps = [demo_step])
status = emr.describe_jobflow(jobid)
#log into AWS to monitor further
#!/usr/bin/env python
# encoding: utf-8
import decimal
import math
import sys
def record_to_decimal(sensor_record):
    """Truncate each channel string to a 3-decimal-place Decimal.

    Args:
        sensor_record: iterable of numeric strings.
    Returns:
        List of Decimals quantized with ROUND_DOWN.
    """
    quantum = decimal.Decimal('.001')
    return [decimal.Decimal(value).quantize(quantum, rounding=decimal.ROUND_DOWN)
            for value in sensor_record]
def record_cleaner(record):
    """Split a raw sensor line into (machine_key, [c1, c2, c3]).

    The first tab field may carry a prefix before the literal 'Machine'
    substring (presumably added by an upstream step — verify against the
    actual input); everything from 'Machine' onward becomes the key.
    Raises ValueError when 'Machine' is absent from the first field.
    """
    record = record.strip()
    (super_key, record_date, c1, c2, c3) = record.split('\t')
    key = super_key[super_key.index('Machine'):]
    values = [c1, c2, c3]
    return (key, values)
def compute_distance(target, data):
    """Euclidean distance between two 3-channel points (returns a float)."""
    squares = [(target[i] - data[i]) ** 2 for i in range(3)]
    return math.sqrt(sum(squares))
def process(target, record):
    """Emit '<machine>' TAB '<distance from target>' for one input line."""
    (key, values) = record_cleaner(record)
    transformed_data = record_to_decimal(values)
    distance = compute_distance(target, transformed_data)
    # print() form keeps the script runnable under Python 3; the original
    # Python-2-only print statement does not parse there.
    print('%s\t%s' % (key, distance))
# Build the target point from strings: Decimal(-1.53) constructed from a
# float carries binary round-off (-1.5300000000000000266…); the string
# form is exact.
target = (decimal.Decimal('-1.53'), decimal.Decimal('0.144'), decimal.Decimal('1.99'))
for record in sys.stdin:
    process(target, record)
#!/usr/bin/env python
# encoding: utf-8
import decimal
import sys
def process(min_machine, min_distance, record):
    """Track the minimum distance seen for the current machine key.

    Input arrives sorted by key; when the key changes, the finished
    machine's minimum is printed and tracking restarts. Returns the
    updated (min_machine, min_distance) pair.
    """
    (key, value) = record.split("\t")
    # strip() drops the trailing newline before Decimal parsing.
    data = decimal.Decimal(value.strip()).quantize(
        decimal.Decimal('.001'), rounding=decimal.ROUND_DOWN)
    if min_machine is None:
        # Very first record of the stream.
        min_machine = key
        min_distance = None
    if min_machine != key:
        # Key boundary: flush the previous machine's minimum.
        print("%s\t%s" % (min_machine, min_distance))
        min_machine = key
        min_distance = None
    if min_distance is None or data < min_distance:
        min_distance = data
    return (min_machine, min_distance)
min_distance = None
min_machine = None
for record in sys.stdin:
    (min_machine, min_distance) = process(min_machine, min_distance, record)
# Flush the last machine's minimum; the guard avoids printing 'None\tNone'
# when no input arrived at all.
if min_machine is not None:
    print("%s\t%s" % (min_machine, min_distance))
#!/usr/bin/env python
# encoding: utf-8
import sys
for record in sys.stdin:
    # Strip the trailing newline so the re-emitted line is not double-spaced
    # (print adds its own newline).
    (key, value) = record.rstrip('\n').split('\t')
    # Funnel every record onto the single 'agg' key so one reducer sees
    # all machine minima and can pick the global winner.
    print("%s\t%s:%s" % ('agg', key, value))
#!/usr/bin/env python
# encoding: utf-8
import decimal
import sys
def process(min_machine, min, record):
    """Fold one 'agg\\t<machine>:<distance>' line into the running minimum.

    Returns the updated (min_machine, min) pair. Malformed lines are
    skipped, preserving the original best-effort behaviour but with named
    exception types instead of a bare 'except'. NOTE(review): the 'min'
    parameter shadows the builtin; kept to preserve the signature.
    """
    try:
        (key, value) = record.split('\t')
        (machine, distance) = value.split(':')
        data = decimal.Decimal(distance).quantize(
            decimal.Decimal('.001'), rounding=decimal.ROUND_DOWN)
        if min is None or data < min:
            min = data
            min_machine = machine
    except (ValueError, decimal.InvalidOperation):
        # Unparseable record: keep the current minimum and move on.
        pass
    return (min_machine, min)
min = None
min_machine = None
for record in sys.stdin:
    # The original guard was 'record != None or record != ""', which is
    # always true; plain truthiness expresses the intended filter.
    if record:
        (min_machine, min) = process(min_machine, min, record)
print("%s\t%s" % (min_machine, min))
#!/usr/bin/env python
# encoding: utf-8
import boto
from boto.s3.key import Key
from boto.emr.step import StreamingStep
from boto.emr.bootstrap_action import BootstrapAction
# Local checkout root; adjust to your own path.
root_path = '/home/barrett/Git/EMR-DEMO/code/'
s3 = boto.connect_s3()
# create_bucket returns the existing bucket if it is already owned here.
emr_demo_bucket = s3.create_bucket('bearrito.demos.emr')
emr_demo_bucket.set_acl('private')
# Input consumed by the first streaming step.
input_key = Key(emr_demo_bucket)
input_key.key = "input/0/mapper_input"
input_key.set_contents_from_filename(root_path + 'resources/mapper_input')
# Bootstrap script runs on every node before any step starts.
mapper_key = Key(emr_demo_bucket)
mapper_key.key = "scripts/bootstrap.sh"
mapper_key.set_contents_from_filename(root_path + 'src/BootStrap.sh')
bootstrap_step = BootstrapAction("bootstrap.sh",'s3://bearrito.demos.emr/scripts/bootstrap.sh',None)
# NOTE(review): the same Key object is re-pointed for each upload; every
# set_contents_from_filename call uses the .key assigned just above it.
mapper_key.key = "scripts/mapper_nearest_0.py"
mapper_key.set_contents_from_filename(root_path + 'src/EMRNearestMapper0.py')
mapper_key.key = "scripts/mapper_nearest_1.py"
mapper_key.set_contents_from_filename(root_path + 'src/EMRNearestMapper1.py')
reducer_key = Key(emr_demo_bucket)
reducer_key.key = "scripts/reducer_nearest_0.py"
reducer_key.set_contents_from_filename(root_path + 'src/EMRNearestReducer0.py')
reducer_key.key = "scripts/reducer_nearest_1.py"
reducer_key.set_contents_from_filename(root_path + 'src/EMRNearestReducer1.py')
# Phase 1: per-machine minimum distance to the target point.
nearest_0 = StreamingStep(name ='EMR First Phase'
,mapper='s3://bearrito.demos.emr/scripts/mapper_nearest_0.py'
,reducer='s3://bearrito.demos.emr/scripts/reducer_nearest_0.py'
,input='s3://bearrito.demos.emr/input/0'
,output='s3://bearrito.demos.emr/output/0')
# Phase 2: global minimum over phase 1's output (chained via output/0).
nearest_1 = StreamingStep(name ='EMR Second Phase'
,mapper='s3://bearrito.demos.emr/scripts/mapper_nearest_1.py'
,reducer='s3://bearrito.demos.emr/scripts/reducer_nearest_1.py'
,input='s3://bearrito.demos.emr/output/0'
,output='s3://bearrito.demos.emr/output/1')
emr = boto.connect_emr()
# run_jobflow returns the job-flow id; poll describe_jobflow for progress.
jobid = emr.run_jobflow(name="EMR Two Phase"
,log_uri='s3://bearrito.demos.logs'
,steps = [nearest_0,nearest_1]
,bootstrap_actions=[bootstrap_step])
status = emr.describe_jobflow(jobid)
# Bootstrap action: build and install Python 2.7 from source on the node.
wget http://python.org/ftp/python/2.7.2/Python-2.7.2.tar.bz2
tar xjf Python-2.7.2.tar.bz2   # extract (x) bzip2 (j) archive file (f)
cd Python-2.7.2
./configure --with-threads --enable-shared
make
sudo make install
# Expose the shared library where the dynamic linker looks for it.
sudo ln -s /usr/local/lib/libpython2.7.so.1.0 /usr/lib/
# Original linked into '/usr/' — the .so belongs in /usr/lib/ as well.
sudo ln -s /usr/local/lib/libpython2.7.so /usr/lib/
-- Per-machine aggregate of channel_1. Note: GROUP BY is two words;
-- 'GROUPBY' is a syntax error in HiveQL.
SELECT sum(channel_1), count(channel_1)
FROM sensor_records
GROUP BY machine_name
ssh -i /home/barrett/EC2/MyKey.pem hadoop@ec2-xxx.xxx.xx.xx.compute.amazonaws.com
hadoop@ec2$ hive
hive>
hive> show tables;
OK
Time Taken: 12.02 seconds
hive>
You should not have any tables yet
-- External table over the tab-delimited sensor data already in S3;
-- dropping an EXTERNAL table leaves the underlying files intact.
CREATE EXTERNAL TABLE sensor_records (
dq_machine_name string, record_date string, channel_1 float, channel_2 float, channel_3 float)
-- Partitioned by machine so per-machine queries prune to one S3 prefix.
PARTITIONED BY (machine_name string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION 's3://bearrito.demos.emr/input/hive/Sensor/';
-- Register the partition directories that already exist under LOCATION
-- (Amazon EMR Hive extension).
ALTER TABLE sensor_records RECOVER PARTITIONS;
SELECT COUNT(*) FROM sensor_records;

-- HiveQL comments use '--', not '//'.
-- These won't launch Hadoop jobs:
-- no real computation to perform when partitioned.
SELECT * FROM sensor_records LIMIT 10;

SELECT * FROM sensor_records
WHERE machine_name = 'Machine-1' LIMIT 10;

-- This will launch jobs. Why is that?
SELECT * FROM sensor_records
WHERE machine_name = 'Machine-1' AND channel_1 < 0.0 LIMIT 10;

-- Since we know the distribution of the channels by machine,
-- we can easily check that aggregation is correct.
-- Can anyone guess the result set? Think about the math.
SELECT machine_name, AVG(channel_1) FROM sensor_records GROUP BY machine_name;
$ pig
> pwd
> cd s3://bearrito.demos.emr/input/0
> ls
-- Load the tab-separated sensor records (PigStorage defaults to '\t').
-- The original listing was truncated mid-schema ('channel_3:floa') and
-- missed several statement-terminating semicolons.
SENSOR_RECORDS = LOAD 's3://bearrito.demos.emr/input/0'
    AS (machine_name:chararray, record_date:chararray,
        channel_1:float, channel_2:float, channel_3:float);

-- Average channel_1 per machine.
GRP_SR = GROUP SENSOR_RECORDS BY machine_name;
AVG_GRP_SR = FOREACH GRP_SR GENERATE group, AVG(SENSOR_RECORDS.channel_1);

-- Keep only large channel_3 readings, then take a 10% sample.
FILTERED_C3_SR = FILTER SENSOR_RECORDS BY channel_3 > 10;
SMPL_FLT_C3_SR = SAMPLE FILTERED_C3_SR .10;

-- Pack the three channels into a tuple, then flatten it back out.
TUPLE_SR = FOREACH SENSOR_RECORDS
    GENERATE machine_name,
             TOTUPLE(channel_1, channel_2, channel_3) AS channel_tuple;
FLAT_TUPLE_SR = FOREACH TUPLE_SR
    GENERATE machine_name, FLATTEN(channel_tuple);