histogram
01 | # histogram (example #1) |
02 | x = sc.parallelize([ 1 , 3 , 1 , 2 , 3 ]) |
03 | y = x.histogram(buckets = 2 ) |
10 | # histogram (example #2) |
11 | x = sc.parallelize([ 1 , 3 , 1 , 2 , 3 ]) |
12 | y = x.histogram([ 0 , 0.5 , 1 , 1.5 , 2 , 2.5 , 3 , 3.5 ]) |
17 | ([ 0 , 0.5 , 1 , 1.5 , 2 , 2.5 , 3 , 3.5 ], [ 0 , 0 , 2 , 0 , 1 , 0 , 2 ]) |
mean
2 | x = sc.parallelize([ 1 , 3 , 2 ]) |
variance
2 | x = sc.parallelize([ 1 , 3 , 2 ]) |
3 | y = x.variance() # population variance: divides by N |
stdev
2 | x = sc.parallelize([ 1 , 3 , 2 ]) |
3 | y = x.stdev() # population standard deviation: divides by N |
sampleStdev
2 | x = sc.parallelize([ 1 , 3 , 2 ]) |
3 | y = x.sampleStdev() # sample standard deviation: divides by N-1 |
sampleVariance
2 | x = sc.parallelize([ 1 , 3 , 2 ]) |
3 | y = x.sampleVariance() # sample variance: divides by N-1 |
countByValue
2 | x = sc.parallelize([ 1 , 3 , 1 , 2 , 3 ]) |
8 | defaultdict(<type 'int'>, { 1 : 2 , 2 : 1 , 3 : 2 }) |
top
2 | x = sc.parallelize([ 1 , 3 , 1 , 2 , 3 ]) |
takeOrdered
2 | x = sc.parallelize([ 1 , 3 , 1 , 2 , 3 ]) |
3 | y = x.takeOrdered(num = 3 ) |
take
2 | x = sc.parallelize([ 1 , 3 , 1 , 2 , 3 ]) |
first
2 | x = sc.parallelize([ 1 , 3 , 1 , 2 , 3 ]) |
collectAsMap
2 | x = sc.parallelize([( 'C' , 3 ),( 'A' , 1 ),( 'B' , 2 )]) |
7 | [( 'C' , 3 ), ( 'A' , 1 ), ( 'B' , 2 )] |
8 | { 'A' : 1 , 'C' : 3 , 'B' : 2 } |
keys
2 | x = sc.parallelize([( 'C' , 3 ),( 'A' , 1 ),( 'B' , 2 )]) |
7 | [( 'C' , 3 ), ( 'A' , 1 ), ( 'B' , 2 )] |
values
2 | x = sc.parallelize([( 'C' , 3 ),( 'A' , 1 ),( 'B' , 2 )]) |
7 | [( 'C' , 3 ), ( 'A' , 1 ), ( 'B' , 2 )] |
reduceByKey
2 | x = sc.parallelize([( 'B' , 1 ),( 'B' , 2 ),( 'A' , 3 ),( 'A' , 4 ),( 'A' , 5 )]) |
3 | y = x.reduceByKey( lambda agg, obj: agg + obj) |
7 | [( 'B' , 1 ), ( 'B' , 2 ), ( 'A' , 3 ), ( 'A' , 4 ), ( 'A' , 5 )] |
reduceByKeyLocally
2 | x = sc.parallelize([( 'B' , 1 ),( 'B' , 2 ),( 'A' , 3 ),( 'A' , 4 ),( 'A' , 5 )]) |
3 | y = x.reduceByKeyLocally( lambda agg, obj: agg + obj) |
7 | [( 'B' , 1 ), ( 'B' , 2 ), ( 'A' , 3 ), ( 'A' , 4 ), ( 'A' , 5 )] |
|