MAX(is_backfilled_userId::int)::boolean AS is_backfilled_userId,
MAX(is_visit_of_signup::int)::boolean AS is_visit_of_signup,
MAX(is_visitor_first_visit::int)::boolean AS is_visitor_first_visit,
CAST(MAX(CAST(is_backfilled_userId AS int)) AS boolean) AS is_backfilled_userId,
CAST(MAX(CAST(is_visit_of_signup AS int)) AS boolean) AS is_visit_of_signup,
CAST(MAX(CAST(is_visitor_first_visit AS int)) AS boolean) AS is_visitor_first_visit,Use explicit casts to change data type
Must cast string to int before casting as boolean
)
GROUP BY 1,2,3,4,5
;
) x
GROUP BY
visit_created, visitId, visit_hash, ip_address, visitorId;Alias all subquery tables
Group by field name
WHERE created > (SELECT start_ts FROM temp_time)
AND created < (SELECT end_date FROM temp_time)
--don't consider Udemy IP addresses to be bots:
AND ip NOT IN ('4.53.135.250','178.251.45.20','89.101.248.170','4.16.193.102')
GROUP BY 1,2
HAVING COUNT(*)>1000 AND COUNT(DISTINCT visitorId)<5
) botChange HAVING in Group BY
To WHERE clause
WHERE cast(created as date) = '{datestamp}'
AND ip NOT IN ('4.53.135.250','178.251.45.20','89.101.248.170','4.16.193.102')
GROUP BY cast(created as date),ip
) a
WHERE count_ids > 1000
AND count_visitorID < 5
) botLEFT JOIN raw_data.u_user u ON ex.usitorId=u.id
AND ex.has_userId=TRUE AND u.created>=ex.usitor_test_created
LEFT JOIN (SELECT MIN(experiment_created) AS first_created
FROM s3_ab_tracked_experiment) mn
ON 1=1
WHERE
v.visit_created>mn.first_created
AND ab.created>=ex.experiment_created
LEFT JOIN s3_u_user u
ON ex.has_userId = 1
AND ex.usitorId=u.id
Rewrite joins using greater than or less than
DELETE FROM test_table WHERE datestamp = '2016-03-31';
INSERT INTO test_table select * from other_test_table;INSERT OVERWRITE TABLE test_table PARTITION(datestamp='2016-03-31')
SELECT * FROM other_test_table;Insert overwrite handles delete and write
Partitions ensure no duplication
CREATE TABLE analytics.visit_to_traffic_attribution
(
visit_created timestamp encode lzo,,
userId integer encode lzo,
is_backfilled_userId boolean encode raw,
is_visit_of_signup boolean encode runlength,
is_visitor_first_visit boolean encode runlength,
traffic_attribution_is_retargeting boolean encode runlength,
traffic_page_type varchar(75) encode lzo,
traffic_landing_url varchar(500) encode lzo,
traffic_landing_courseId integer encode lzo,
PRIMARY KEY(visitId)
)
DISTKEY(visitorId)
SORTKEY(visitId,visit_created,userId);
Data Type Conversions
Remove encoding
Partitions replace distkey
DROP TABLE IF EXISTS visit_to_traffic_attribution;
CREATE TABLE visit_to_traffic_attribution (
visit_created timestamp,
is_backfilled_userid boolean,
is_visit_of_signup boolean,
is_visitor_first_visit boolean,
traffic_attribution_is_retargeting boolean,
traffic_page_type varchar(75),
traffic_landing_url string,
traffic_landing_courseid int)
PARTITIONED BY (datestamp date)
STORED AS SEQUENCEFILE;