glom(): returns an RDD with one array per partition.
This allows a worker to access all the data in its partition at once.
# Build (2*x, x) key/value pairs, redistribute them by key into 10
# partitions, then glom() so that every partition is turned into a single
# list holding all of that partition's pairs.
A = (
    sc.parallelize(range(1000000))
    .map(lambda x: (2 * x, x))
    .partitionBy(10)
    .glom()  # one list per partition (not one list per key)
)
# glom() keeps the partitioning, so this still reports 10 partitions.
print(A.getNumPartitions())
def variation(B):
    """Summarise one glommed partition by its total variation.

    B is the list of (key, value) pairs that glom() produced for a single
    partition, so it can be indexed and measured like any Python list.

    Returns a tuple (first_key, count, total) where:
      * first_key is the key of the first pair in the partition,
      * count is the number of pairs in the partition,
      * total is the sum of |value[i+1] - value[i]| over consecutive pairs.

    Returns None when the partition holds fewer than two pairs, because
    there is no consecutive pair to take a difference of.
    """
    if len(B) <= 1:
        return None

    # Walk the list once, remembering the previous value, instead of
    # indexing B[i] and B[i+1] on every step.
    pairs = iter(B)
    first = next(pairs)
    previous = first[1]
    total = 0
    for pair in pairs:
        total += abs(pair[1] - previous)
        previous = pair[1]
    return (first[0], len(B), total)
# Each element of A is one partition's list, so map() calls variation once
# per partition; collect() brings the per-partition summaries to the driver.
output = A.map(variation).collect()
print(output)
10
[(0, 200000, 999995), None, (2, 200000, 999995), None,
(4, 200000, 999995), None, (6, 200000, 999995), None,
(8, 200000, 999995), None]