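In this demo, we will run a PySpark job that reads sales data from S3, totals sales per category, and writes the results back to S3.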
S3 bucket: emr-demo-401201
scripts
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import sum, col
# Initialize the Spark session for this job
spark = SparkSession.builder.appName("EMR Demo Job").getOrCreate()

try:
    # Read input data from S3; the first row is treated as the header and
    # column types are inferred from the data.
    input_path = "s3://emr-demo-401201/input/sample_data.csv"
    df = spark.read.csv(input_path, header=True, inferSchema=True)

    # Total sales (price * quantity) per category, largest total first.
    # Spark functions are reached through the `F` namespace so the call is
    # unambiguous and does not depend on a name that shadows builtin `sum`.
    result = (
        df.groupBy("category")
        .agg(F.sum(F.col("price") * F.col("quantity")).alias("total_sales"))
        .orderBy(F.col("total_sales").desc())
    )

    # Write results back to S3 as Parquet, replacing any previous output
    output_path = "s3://emr-demo-401201/output/results"
    result.write.mode("overwrite").parquet(output_path)

    # Show results (for demo purposes)
    result.show()
finally:
    # Stop the Spark session even if the read, aggregation, or write fails,
    # so the session is always released.
    spark.stop()
input/sample_data.csv
category,product,price,quantity,date
Electronics,Smartphone,599.99,2,2024-01-01
Home & Kitchen,Coffee Maker,79.99,1,2024-01-01
Clothing,T-Shirt,19.99,3,2024-01-01
Books,Science Fiction Novel,14.99,1,2024-01-02
Electronics,Laptop,1299.99,1,2024-01-02
Home & Kitchen,Blender,49.99,1,2024-01-02
Clothing,Jeans,59.99,2,2024-01-03
Sports & Outdoors,Yoga Mat,29.99,1,2024-01-03
Electronics,Wireless Earbuds,129.99,1,2024-01-03
Home & Kitchen,Toaster,34.99,1,2024-01-04
Clothing,Dress,89.99,1,2024-01-04
Books,Cookbook,24.99,2,2024-01-04
Electronics,Tablet,399.99,1,2024-01-05
Home & Kitchen,Air Fryer,99.99,1,2024-01-05
Clothing,Sweater,49.99,2,2024-01-05
Sports & Outdoors,Dumbbells Set,79.99,1,2024-01-06
Electronics,Smart Watch,199.99,1,2024-01-06
Home & Kitchen,Electric Kettle,39.99,1,2024-01-06
Clothing,Jacket,129.99,1,2024-01-07
Books,Mystery Novel,12.99,3,2024-01-07
Electronics,Digital Camera,549.99,1,2024-01-07
Home & Kitchen,Food Processor,89.99,1,2024-01-08
Clothing,Shorts,29.99,2,2024-01-08
Sports & Outdoors,Tennis Racket,119.99,1,2024-01-08
Electronics,Gaming Console,499.99,1,2024-01-09
Home & Kitchen,Microwave,149.99,1,2024-01-09
Clothing,Socks,9.99,5,2024-01-09
Books,Biography,19.99,2,2024-01-10
Electronics,Bluetooth Speaker,79.99,1,2024-01-10
Home & Kitchen,Slow Cooker,59.99,1,2024-01-10
Clothing,Scarf,24.99,2,2024-01-11
Sports & Outdoors,Bicycle,299.99,1,2024-01-11
Electronics,Monitor,249.99,1,2024-01-11
Home & Kitchen,Vacuum Cleaner,199.99,1,2024-01-12
Clothing,Gloves,14.99,3,2024-01-12
Books,Self-Help Book,16.99,2,2024-01-12
Electronics,Printer,159.99,1,2024-01-13
Home & Kitchen,Rice Cooker,49.99,1,2024-01-13
Clothing,Hat,19.99,2,2024-01-13
Sports & Outdoors,Camping Tent,199.99,1,2024-01-14
Electronics,External Hard Drive,89.99,1,2024-01-14
Home & Kitchen,Dish Set,69.99,1,2024-01-14
Clothing,Swimsuit,39.99,1,2024-01-15
Books,Travel Guide,22.99,1,2024-01-15
Electronics,Wireless Mouse,29.99,2,2024-01-15
Home & Kitchen,Cutlery Set,44.99,1,2024-01-16
Clothing,Belt,24.99,1,2024-01-16
Sports & Outdoors,Soccer Ball,19.99,2,2024-01-16
Electronics,Fitness Tracker,99.99,1,2024-01-17
Home & Kitchen,Stand Mixer,249.99,1,2024-01-17
EMR-Demo-Cluster
hadoop
My-Spark-App