+
+
=
RDD[T]
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
/** Sample record used throughout the examples: an artist's name and age. */
case class Artist(name: String, age: Int)
// Fixture reused by the RDD, DataFrame, Dataset and TypedDataset examples.
val defaultArtists: Seq[Artist] = Seq(
  Artist("Offset", 25),
  Artist("Kanye West", 39),
  Artist("Frank Ocean", 29),
  Artist("John Mayer", 39),
  Artist("Aretha Franklin", 74),
  Artist("Kendrick Lamar", 29),
  Artist("Carly Rae Jepsen", 31)
)
// Build a session with master "local[*]" and distribute the fixture as an RDD.
val spark = SparkSession.builder().master("local[*]").getOrCreate()
val artists: RDD[Artist] = spark.sparkContext.parallelize(defaultArtists)

// Turn each artist into (age, 1), then add the pairs component-wise so that a
// single reduce yields both the sum of the ages and the number of artists.
val (totalAge, totalCount) = artists
  .map(artist => (artist.age, 1))
  .reduce((left, right) => (left._1 + right._1, left._2 + right._2))

println(s"Average age: ${totalAge.toDouble / totalCount.toDouble}")
////
scala> Average age: 38.0
DataFrame
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.avg
// The case-class fields become the columns; the aggregation refers to "age" by its string name.
val artists: DataFrame = spark.createDataFrame(defaultArtists)
artists.agg(avg("age")).show()
+--------+
|avg(age)|
+--------+
| 38.0|
+--------+
artists.select("genre").show // throws an exception :(
org.apache.spark.sql.AnalysisException: cannot resolve '`genre`' given input columns: [name, age];;
'Project ['genre]
+- LocalRelation [name#0, age#1]
at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:77)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:74)
Dataset[T]
import org.apache.spark.sql.Dataset
import spark.implicits._ // import default Encoders
val artists: Dataset[Artist] = spark.createDataset(defaultArtists)
artists
  .filter(artist => artist.age > 30) // typed: the predicate is an ordinary Scala function on Artist
  .agg(avg("age"))                   // untyped: the column is looked up by its string name
  .show()
+--------+
|avg(age)|
+--------+
| 45.75|
+--------+
artists.select("genre").show // hmm, still throws an exception...
org.apache.spark.sql.AnalysisException: cannot resolve '`genre`' given input columns: [name, age];;
'Project ['genre]
+- LocalRelation [name#2, age#3]
at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:77)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:74)
import frameless._
// The implicit is given an explicit type so that its type does not depend on inference.
implicit val sqlContext: org.apache.spark.sql.SQLContext = spark.sqlContext // required for frameless
val artists: TypedDataset[Artist] = TypedDataset.create(defaultArtists)
artists
  .filter(_.age > 30)
  .select(avg(artists('age))) // typechecked column name!
  .show().run // explicit `.run`
+-----+
| _1|
+-----+
|45.75|
+-----+
artists.filter(_.age > 30).select(artists('name)).show().run
+----------------+
| _1|
+----------------+
| Kanye West|
| John Mayer|
| Aretha Franklin|
|Carly Rae Jepsen|
+----------------+
artists.select(artists('blah)) // doesn't compile
// Result row for the aggregation below: an age and the number of artists that have it.
case class AgeCount(age: Int, count: Long)
// Group by the `age` column, count the rows of each group, view every result row as
// an AgeCount, and keep only the ages that occur more than once.
artists
.groupBy(artists('age))
.agg(count(artists('age)))
.as[AgeCount] // compile-time `.as`!
.filter(_.count > 1)
.show().run
+---+-----+
|age|count|
+---+-----+
| 39| 2|
| 29| 2|
+---+-----+
"Unable to find encoder for type stored in a Dataset.
Primitive types (Int, String, etc) and Product types (case classes)
are supported by importing spark.implicits._
Support for serializing other types will be added in future releases"
/** Music genre of an artist. The class is sealed, so every subtype has to be declared in this file. */
sealed abstract class Genre
/** The genres used by the examples, each modelled as a case object. */
object Genre {
case object HipHop extends Genre
case object RnB extends Genre
case object Soul extends Genre
case object Pop extends Genre
case object Rock extends Genre
}
/** Pairs an [[Artist]] with its [[Genre]]. */
case class ArtistWithGenre(artist: Artist, genre: Genre)
// Won't compile:
// could not find implicit value for parameter encoder:
// frameless.TypedEncoder[examples.ArtistWithGenre]
val artistsWithGenre: TypedDataset[ArtistWithGenre] = TypedDataset.create(Seq(
ArtistWithGenre(Artist("Offset", 25), Genre.HipHop),
ArtistWithGenre(Artist("Kanye West", 39), Genre.HipHop),
ArtistWithGenre(Artist("Frank Ocean", 29), Genre.RnB),
ArtistWithGenre(Artist("John Mayer", 39), Genre.Rock),
ArtistWithGenre(Artist("Aretha Franklin", 74), Genre.Soul),
ArtistWithGenre(Artist("Kendrick Lamar", 29), Genre.HipHop),
ArtistWithGenre(Artist("Carly Rae Jepsen", 31), Genre.Pop)))
// define an implicit Injection and frameless will use it
// to create a TypedEncoder
/**
 * Maps every [[Genre]] to a fixed `Int` code and back.
 *
 * The implicit carries an explicit type so that its type does not depend on
 * inference over the anonymous class.
 */
implicit val genreInjection: Injection[Genre, Int] = new Injection[Genre, Int] {
  /** Encodes a genre as its integer code (1 to 5). */
  def apply(genre: Genre): Int = genre match {
    case Genre.HipHop => 1
    case Genre.RnB    => 2
    case Genre.Soul   => 3
    case Genre.Pop    => 4
    case Genre.Rock   => 5
  }

  /**
   * Decodes an integer code back into its genre.
   *
   * @throws IllegalArgumentException if `i` is not one of the codes produced by `apply`
   */
  def invert(i: Int): Genre = i match {
    case 1 => Genre.HipHop
    case 2 => Genre.RnB
    case 3 => Genre.Soul
    case 4 => Genre.Pop
    case 5 => Genre.Rock
    // Fail with a message that names the bad code rather than a bare MatchError.
    case other =>
      throw new IllegalArgumentException(s"Unknown Genre code: $other (expected 1 to 5)")
  }
}
import cats.Eq
import cats.implicits._
// Genre values are compared with plain `==`, which is what `fromUniversalEquals` builds.
implicit val genreEq: Eq[Genre] = Eq.fromUniversalEquals[Genre]
// Compiles!
val artistsWithGenre: TypedDataset[ArtistWithGenre] = TypedDataset.create(Seq(
ArtistWithGenre(Artist("Offset", 25), Genre.HipHop),
ArtistWithGenre(Artist("Kanye West", 39), Genre.HipHop),
ArtistWithGenre(Artist("Frank Ocean", 29), Genre.RnB),
ArtistWithGenre(Artist("John Mayer", 39), Genre.Rock),
ArtistWithGenre(Artist("Aretha Franklin", 74), Genre.Soul),
ArtistWithGenre(Artist("Kendrick Lamar", 29), Genre.HipHop),
ArtistWithGenre(Artist("Carly Rae Jepsen", 31), Genre.Pop)))
artistsWithGenre.filter(_.genre === Genre.HipHop).show().run
+-------------------+-----+
| artist|genre|
+-------------------+-----+
| [Offset,25]| 1|
| [Kanye West,39]| 1|
|[Kendrick Lamar,29]| 1|
+-------------------+-----+
(Maybe there's a way out, but it's not obvious.)
¯\_(ツ)_/¯
type safe columns