In this demo, we will take a 1,000-row sample of the Enigma-JHU COVID-19 dataset, upload it to S3, register it as an external table, and query it with SQL:
head -n 1000 Enigma-JHU.csv > Enigma-JHU-1000.csv
fips,admin2,province_state,country_region,last_update,latitude,longitude,confirmed,deaths,recovered,active,combined_key
,,Anhui,China,2020-01-22T17:00:00,31.826,117.226,1,,,,"Anhui, China"
,,Beijing,China,2020-01-22T17:00:00,40.182,116.414,14,,,,"Beijing, China"
,,Chongqing,China,2020-01-22T17:00:00,30.057,107.874,6,,,,"Chongqing, China"
,,Fujian,China,2020-01-22T17:00:00,26.079,117.987,1,,,,"Fujian, China"
,,Gansu,China,2020-01-22T17:00:00,36.061,103.834,,,,,"Gansu, China"
code Enigma-JHU-1000.csv
covid19-data-lake-demo-917212
-- Register the raw CSV files under the S3 prefix as an external table.
-- All columns are declared STRING because OpenCSVSerde exposes every field
-- as text; numeric casts are applied at query time instead.
-- IF NOT EXISTS makes the script idempotent (pairs with the DROP TABLE
-- IF EXISTS used during cleanup).
CREATE EXTERNAL TABLE IF NOT EXISTS covid_data (
fips STRING,
admin2 STRING,
province_state STRING,
country_region STRING,
last_update STRING,
latitude STRING,
longitude STRING,
confirmed STRING,
deaths STRING,
recovered STRING,
active STRING,
combined_key STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
"separatorChar" = ",",
"quoteChar" = "\"",
"escapeChar" = "\\"
)
STORED AS TEXTFILE
LOCATION 's3://covid19-data-lake-demo-917212/'
-- Skip the header row (column names) present in each CSV file.
TBLPROPERTIES ('skip.header.line.count'='1');
-- Sanity check: preview a handful of rows to confirm the SerDe parsed the CSV.
SELECT * FROM covid_data LIMIT 10;
-- Top 10 countries/regions by total confirmed cases.
-- TRY_CAST (Athena/Presto) returns NULL for empty, NULL, or malformed
-- values instead of failing the whole query the way CAST would; COALESCE
-- then maps those NULLs to 0 so the SUMs stay well-defined.
SELECT
country_region,
SUM(COALESCE(TRY_CAST(confirmed AS BIGINT), 0)) AS total_cases,
SUM(COALESCE(TRY_CAST(deaths AS BIGINT), 0)) AS total_deaths
FROM covid_data
GROUP BY country_region
ORDER BY total_cases DESC
LIMIT 10;
-- Clean up the table metadata. Because covid_data is an EXTERNAL table,
-- this does NOT delete the CSV objects in S3 — only the catalog entry.
DROP TABLE IF EXISTS covid_data;
Note: to permanently delete the data itself, you must also empty and delete the S3 bucket — dropping the external table removes only its definition.