Luca Ghislotti
Computer Engineering Student @UniBG
<Luca Parimbelli - 1053142>
<Luca Ghislotti - 1052975>
<Alessandro Mazzola - 1053121>
<Andrea Marinò - 1053230>
<Christian Spano - 1053028># ADD IDX OF WATCH NEXT VIDEOS
watch_next_dataset_agg = watch_next_dataset /
.groupBy(col("idx").alias("idx_ref")) /
.agg(collect_list("url") /
.alias("watch_next_url")) /
watch_next_dataset_agg.printSchema()
tedx_dataset_agg = tedx_dataset /
.join(watch_next_dataset_agg, tedx_dataset.idx == watch_next_dataset_agg.idx_ref, "left") /
.drop("idx_ref").select(col("idx").alias("_id"), col("*")).drop("idx") /
tedx_dataset_agg.printSchema()
# ADD INFO ABOUT SPEAKER
tedx_dataset_agg = tedx_dataset_agg /
.join(speaker_info_dataset, tedx_dataset_agg.main_speaker == speaker_info_dataset.name_speaker, "left") /
.select(col("*"), struct(col("name_speaker") /
.alias("speaker_name"), col("speaker_link") /
.alias("speaker_url"), col("profession") /
.alias("speaker_profession"), col("info_about") /
.alias("info")).alias("speaker")).drop("name_speaker") /
tedx_dataset_agg.printSchema()# ADD IDX OF WATCH NEXT VIDEOS
watch_next_dataset_agg = watch_next_dataset /
.groupBy(col("idx").alias("idx_ref")) /
.agg(collect_list("url") /
.alias("watch_next_url")) /
watch_next_dataset_agg.printSchema()
tedx_dataset_agg = tedx_dataset /
.join(watch_next_dataset_agg, tedx_dataset.idx == watch_next_dataset_agg.idx_ref, "left") /
.drop("idx_ref").select(col("idx").alias("_id"), col("*")).drop("idx") /
tedx_dataset_agg.printSchema()
# ADD INFO ABOUT SPEAKER
tedx_dataset_agg = tedx_dataset_agg /
.join(speaker_info_dataset, tedx_dataset_agg.main_speaker == speaker_info_dataset.name_speaker, "left") /
.select(col("*"), struct(col("name_speaker") /
.alias("speaker_name"), col("speaker_link") /
.alias("speaker_url"), col("profession") /
.alias("speaker_profession"), col("info_about") /
.alias("info")).alias("speaker")).drop("name_speaker") /
tedx_dataset_agg.printSchema()watch_next_dataset_path = "s3://tedindex-data/watch_next_dataset.csv"
watch_next_dataset = spark.read.option("header","true").csv(watch_next_dataset_path).dropDuplicates()
watch_next_dataset.printSchema()By Luca Ghislotti
2020 mobile and cloud technologies project