
Learning the Spark Python API: pyspark API (4) – 过往记忆

 dazheng 2015-11-05

countByKey

# countByKey
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
y = x.countByKey()
print(x.collect())
print(y)

Output:
[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]
defaultdict(<type 'int'>, {'A': 3, 'B': 2})
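Note that countByKey is an action rather than a transformation: y above is an ordinary Python defaultdict on the driver, not an RDD. A minimal follow-up (my own addition, not from the original post) that turns it into a plain dict:

print(dict(y))  # e.g. {'A': 3, 'B': 2}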

join

# join
x = sc.parallelize([('C',4),('B',3),('A',2),('A',1)])
y = sc.parallelize([('A',8),('B',7),('A',6),('D',5)])
z = x.join(y)
print(x.collect())
print(y.collect())
print(z.collect())

Output:
[('C', 4), ('B', 3), ('A', 2), ('A', 1)]
[('A', 8), ('B', 7), ('A', 6), ('D', 5)]
[('A', (2, 8)), ('A', (2, 6)), ('A', (1, 8)), ('A', (1, 6)), ('B', (3, 7))]

leftOuterJoin

# leftOuterJoin
x = sc.parallelize([('C',4),('B',3),('A',2),('A',1)])
y = sc.parallelize([('A',8),('B',7),('A',6),('D',5)])
z = x.leftOuterJoin(y)
print(x.collect())
print(y.collect())
print(z.collect())

Output:
[('C', 4), ('B', 3), ('A', 2), ('A', 1)]
[('A', 8), ('B', 7), ('A', 6), ('D', 5)]
[('A', (2, 8)), ('A', (2, 6)), ('A', (1, 8)), ('A', (1, 6)), ('C', (4, None)), ('B', (3, 7))]

rightOuterJoin

# rightOuterJoin
x = sc.parallelize([('C',4),('B',3),('A',2),('A',1)])
y = sc.parallelize([('A',8),('B',7),('A',6),('D',5)])
z = x.rightOuterJoin(y)
print(x.collect())
print(y.collect())
print(z.collect())

Output:
[('C', 4), ('B', 3), ('A', 2), ('A', 1)]
[('A', 8), ('B', 7), ('A', 6), ('D', 5)]
[('A', (2, 8)), ('A', (2, 6)), ('A', (1, 8)), ('A', (1, 6)), ('B', (3, 7)), ('D', (None, 5))]

partitionBy

# partitionBy
x = sc.parallelize([(0,1),(1,2),(2,3)],2)
y = x.partitionBy(numPartitions = 3, partitionFunc = lambda x: x)  # only the key is passed to partitionFunc
print(x.glom().collect())
print(y.glom().collect())

Output:
[[(0, 1)], [(1, 2), (2, 3)]]
[[(0, 1)], [(1, 2)], [(2, 3)]]
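Integer keys map straight to partition indices here; for other key types the partition function just needs to return an integer, which PySpark then takes modulo numPartitions. A hypothetical sketch with string keys (data and names are my own, not from the original post):

# route string keys by the code point of their first letter
a = sc.parallelize([('Apple',1),('Banana',2),('Cherry',3),('Avocado',4)])
b = a.partitionBy(numPartitions=2, partitionFunc=lambda k: ord(k[0]))
print(b.glom().collect())  # keys with the same first letter land in the same partition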

combineByKey

# combineByKey
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
createCombiner = (lambda el: [(el,el**2)])
mergeVal = (lambda aggregated, el: aggregated + [(el,el**2)]) # append to aggregated
mergeComb = (lambda agg1,agg2: agg1 + agg2 )  # concatenate agg1 and agg2
y = x.combineByKey(createCombiner,mergeVal,mergeComb)
print(x.collect())
print(y.collect())

Output:
[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]
[('A', [(3, 9), (4, 16), (5, 25)]), ('B', [(1, 1), (2, 4)])]
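A further hypothetical illustration of the same three-function pattern (not from the original post): computing a per-key mean by carrying a (sum, count) pair.

# per-key mean via (sum, count) accumulators
nums = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
toPair = lambda v: (v, 1)                              # first value seen for a key -> (sum, count)
addVal = lambda acc, v: (acc[0] + v, acc[1] + 1)       # fold another value into (sum, count)
mergePairs = lambda a, b: (a[0] + b[0], a[1] + b[1])   # merge partial (sum, count) pairs
means = nums.combineByKey(toPair, addVal, mergePairs).mapValues(lambda p: p[0] / float(p[1]))
print(means.collect())  # e.g. [('A', 4.0), ('B', 1.5)]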

aggregateByKey

# aggregateByKey
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
zeroValue = [] # empty list is 'zero value' for append operation
mergeVal = (lambda aggregated, el: aggregated + [(el,el**2)])
mergeComb = (lambda agg1,agg2: agg1 + agg2 )
y = x.aggregateByKey(zeroValue,mergeVal,mergeComb)
print(x.collect())
print(y.collect())

Output:
[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]
[('A', [(3, 9), (4, 16), (5, 25)]), ('B', [(1, 1), (2, 4)])]
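aggregateByKey differs from combineByKey only in taking a ready-made zero value instead of a createCombiner function. A further hypothetical sketch (not from the original post), reusing x from above to compute the per-key maximum:

# per-key maximum; -inf is the neutral 'zero value' for max
m = x.aggregateByKey(float('-inf'), max, max)
print(m.collect())  # e.g. [('A', 5), ('B', 2)]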

foldByKey

# foldByKey
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
zeroValue = 1 # one is 'zero value' for multiplication
y = x.foldByKey(zeroValue,lambda agg,x: agg*x )  # computes cumulative product within each key
print(x.collect())
print(y.collect())

Output:
[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]
[('A', 60), ('B', 2)]
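For addition the matching zero value is 0; a minimal hypothetical sketch (my own, reusing x from above):

# cumulative sum within each key
s = x.foldByKey(0, lambda agg, v: agg + v)
print(s.collect())  # e.g. [('A', 12), ('B', 3)]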

groupByKey

# groupByKey
x = sc.parallelize([('B',5),('B',4),('A',3),('A',2),('A',1)])
y = x.groupByKey()
print(x.collect())
print([(j[0],[i for i in j[1]]) for j in y.collect()])

Output:
[('B', 5), ('B', 4), ('A', 3), ('A', 2), ('A', 1)]
[('A', [3, 2, 1]), ('B', [5, 4])]
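groupByKey returns an iterable per key, which is why the print above rebuilds the lists by hand; an equivalent, more readable form (my own sketch, not from the original post) is:

# materialize each grouped iterable as a list
print(x.groupByKey().mapValues(list).collect())  # e.g. [('A', [3, 2, 1]), ('B', [5, 4])]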

flatMapValues

# flatMapValues
x = sc.parallelize([('A',(1,2,3)),('B',(4,5))])
y = x.flatMapValues(lambda x: [i**2 for i in x]) # function is applied to entire value, then result is flattened
print(x.collect())
print(y.collect())

Output:
[('A', (1, 2, 3)), ('B', (4, 5))]
[('A', 1), ('A', 4), ('A', 9), ('B', 16), ('B', 25)]

mapValues

# mapValues
x = sc.parallelize([('A',(1,2,3)),('B',(4,5))])
y = x.mapValues(lambda x: [i**2 for i in x]) # function is applied to entire value
print(x.collect())
print(y.collect())

Output:
[('A', (1, 2, 3)), ('B', (4, 5))]
[('A', [1, 4, 9]), ('B', [16, 25])]

groupWith

# groupWith
x = sc.parallelize([('C',4),('B',(3,3)),('A',2),('A',(1,1))])
y = sc.parallelize([('B',(7,7)),('A',6),('D',(5,5))])
z = sc.parallelize([('D',9),('B',(8,8))])
a = x.groupWith(y,z)
print(x.collect())
print(y.collect())
print(z.collect())
print("Result:")
for key,val in list(a.collect()):
    print(key, [list(i) for i in val])

Output:
[('C', 4), ('B', (3, 3)), ('A', 2), ('A', (1, 1))]
[('B', (7, 7)), ('A', 6), ('D', (5, 5))]
[('D', 9), ('B', (8, 8))]
Result:
D [[], [(5, 5)], [9]]
C [[4], [], []]
B [[(3, 3)], [(7, 7)], [(8, 8)]]
A [[2, (1, 1)], [6], []]

cogroup

# cogroup
x = sc.parallelize([('C',4),('B',(3,3)),('A',2),('A',(1,1))])
y = sc.parallelize([('A',8),('B',7),('A',6),('D',(5,5))])
z = x.cogroup(y)
print(x.collect())
print(y.collect())
for key,val in list(z.collect()):
    print(key, [list(i) for i in val])

Output:
[('C', 4), ('B', (3, 3)), ('A', 2), ('A', (1, 1))]
[('A', 8), ('B', 7), ('A', 6), ('D', (5, 5))]
A [[2, (1, 1)], [8, 6]]
C [[4], []]
B [[(3, 3)], [7]]
D [[], [(5, 5)]]

sampleByKey

# sampleByKey
x = sc.parallelize([('A',1),('B',2),('C',3),('B',4),('A',5)])
y = x.sampleByKey(withReplacement=False, fractions={'A':0.5, 'B':1, 'C':0.2})
print(x.collect())
print(y.collect())

Output:
[('A', 1), ('B', 2), ('C', 3), ('B', 4), ('A', 5)]
[('B', 2), ('C', 3), ('B', 4)]
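The sample is random, so the second output line varies from run to run; the fractions are expected sampling rates per key, not exact counts. Passing the optional seed argument makes a run repeatable. A hypothetical sketch, reusing x from above:

# fix the seed so repeated runs draw the same sample
print(x.sampleByKey(withReplacement=False,
                    fractions={'A':0.5, 'B':1, 'C':0.2},
                    seed=42).collect())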

subtractByKey

# subtractByKey
x = sc.parallelize([('C',1),('B',2),('A',3),('A',4)])
y = sc.parallelize([('A',5),('D',6),('A',7),('D',8)])
z = x.subtractByKey(y)
print(x.collect())
print(y.collect())
print(z.collect())

Output:
[('C', 1), ('B', 2), ('A', 3), ('A', 4)]
[('A', 5), ('D', 6), ('A', 7), ('D', 8)]
[('C', 1), ('B', 2)]

subtract

# subtract
x = sc.parallelize([('C',4),('B',3),('A',2),('A',1)])
y = sc.parallelize([('C',8),('A',2),('D',1)])
z = x.subtract(y)
print(x.collect())
print(y.collect())
print(z.collect())

Output:
[('C', 4), ('B', 3), ('A', 2), ('A', 1)]
[('C', 8), ('A', 2), ('D', 1)]
[('A', 1), ('C', 4), ('B', 3)]

keyBy

# keyBy
x = sc.parallelize([1,2,3])
y = x.keyBy(lambda x: x**2)
print(x.collect())
print(y.collect())

Output:
[1, 2, 3]
[(1, 1), (4, 2), (9, 3)]

repartition

# repartition
x = sc.parallelize([1,2,3,4,5],2)
y = x.repartition(numPartitions=3)
print(x.glom().collect())
print(y.glom().collect())

Output:
[[1, 2], [3, 4, 5]]
[[], [1, 2, 3, 4], [5]]

coalesce

# coalesce
x = sc.parallelize([1,2,3,4,5],2)
y = x.coalesce(numPartitions=1)
print(x.glom().collect())
print(y.glom().collect())

Output:
[[1, 2], [3, 4, 5]]
[[1, 2, 3, 4, 5]]
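Unlike repartition, coalesce defaults to shuffle=False, so it only merges existing partitions and cannot increase their number. A small hypothetical check (my own, reusing x from above):

# coalesce cannot grow the partition count unless shuffle=True
print(x.coalesce(numPartitions=3).getNumPartitions())                # stays at 2
print(x.coalesce(numPartitions=3, shuffle=True).getNumPartitions())  # 3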

zip

# zip
x = sc.parallelize(['B','A','A'])
# zip expects x and y to have same #partitions and #elements/partition
y = x.map(lambda x: ord(x))
z = x.zip(y)
print(x.collect())
print(y.collect())
print(z.collect())

Output:
['B', 'A', 'A']
[66, 65, 65]
[('B', 66), ('A', 65), ('A', 65)]

zipWithIndex

# zipWithIndex
x = sc.parallelize(['B','A','A'],2)
y = x.zipWithIndex()
print(x.glom().collect())
print(y.collect())

Output:
[['B'], ['A', 'A']]
[('B', 0), ('A', 1), ('A', 2)]

zipWithUniqueId

# zipWithUniqueId
x = sc.parallelize(['B','A','A'],2)
y = x.zipWithUniqueId()
print(x.glom().collect())
print(y.collect())

Output:
[['B'], ['A', 'A']]
[('B', 0), ('A', 1), ('A', 3)]
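Unlike zipWithIndex, the ids are not consecutive: element i of partition k gets id k + i*n, where n is the number of partitions, which is why the second 'A' (i=1 in partition k=1, n=2) receives 3. A minimal check of that rule (my own sketch, reusing x from above):

# recompute the ids from the partition layout: id = k + i*n
n = x.getNumPartitions()
ids = [k + i*n for k, part in enumerate(x.glom().collect()) for i, _ in enumerate(part)]
print(ids)  # [0, 1, 3] -- matches zipWithUniqueId above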

