(I really like coffee
and
whisky)
Yamazaki Distillery
Little Nap Coffee Stand
// Increase parallelism before an expensive per-record operation: doubling
// the partition count lets slowFn run in twice as many concurrent tasks.
// NOTE(review): ds.rdd.getNumPartitions converts to an RDD just to read
// the partition count — metadata-only here, but verify it doesn't trigger
// extra analysis in your Spark version.
val ds: Dataset[MyType] = spark.read.parquet(...)
ds.repartition(ds.rdd.getNumPartitions * 2)
.map(slowFn)
// Hint Spark to broadcast the (presumably small) otherDataset so the
// output of the expensive map is joined without a full shuffle
// (broadcast hash join instead of a sort-merge join).
val slowBroadcast = dataset
.map(slowFunction)
.joinWith(broadcast(otherDataset), joinKey)
// Spread the wide aggregation across 1000 tasks, then collapse the
// (small) aggregated result down to a single output partition.
val aggregation = dataset.repartition(1000)
.groupBy(...)
.agg(...)
.coalesce(1) // WARNING: coalesce(1) can collapse the upstream stages into 1 partition/task,
// defeating the repartition(1000) above — use .repartition(1) instead, which
// inserts a shuffle boundary and preserves upstream parallelism
// One long unbroken lineage: groupBy → agg → map → join → filter →
// groupBy → agg. Nothing is materialized along the way, so a late task
// failure forces recomputation of the entire chain; the cache-and-count
// pattern below shows how to checkpoint intermediate results.
val veryLongQuery = dataset.groupBy(...)
.agg(...)
.map(processRow)
.joinWith(anotherDataset)
.filter(filterFn)
.groupBy(...)
.agg(...)
// Break up long query with caching and counting (low overhead strict action)
val query1 = dataset.groupBy(...)
.agg(...).cache() // fixed: .agg requires its aggregate expressions (was `.agg.cache()`),
// matching the .agg(...) form used by the queries above
query1.count // count is a cheap strict action that forces query1 into the cache
val query2 = query1.map(processRow)
.joinWith(anotherDataset).cache()
query2.count // materializes query2 so later stages restart from here on failure
// etc — consider unpersist()ing earlier stages once downstream results are cached
// Pitfall: a plain `val` in the class body is evaluated by the constructor
// for every instance — and re-evaluated when an instance is deserialized
// (presumably after a Spark shuffle/task send — verify for your serializer).
case class MyData(a: String, b: Int) {
val expensiveMember = { ... } // gets reinitialized per construction/deserialization; consider `lazy val`
}