What is PySpark?
Spark is written natively in Scala; PySpark is its Python API.
PySpark Basics
Data is big: PySpark spreads the work across a cluster, so a dataset does not have to fit on one machine.
Word count, step by step:
Input:            The  brown  dog  brown  dog
Map:              (The, 1)  (brown, 1)  (dog, 1)  (brown, 1)  (dog, 1)
Shuffle + Reduce: (The, 1)  (dog, 2)  (brown, 2)
Collect:          (The, 1)  (dog, 2)  (brown, 2)
Demo
# example 1: word count over a small in-memory RDD
text = "the brown dog jumped over the other brown dog"
text_rdd = sc.parallelize(text.split(' '))
text_rdd.map(lambda word: (word, 1)) \
    .reduceByKey(lambda left, right: left + right) \
    .collect()
# example 2: word count over a text file, stripping punctuation first
import string

time_machine = sc.textFile('/user/jasonwhite/time_machine')
time_machine_tuples = time_machine.flatMap(lambda line: line.lower().split(' ')) \
    .map(lambda word: ''.join(ch for ch in word if ch in string.ascii_letters)) \
    .filter(lambda word: word != '') \
    .map(lambda word: (word, 1))
word_counts = time_machine_tuples.reduceByKey(lambda left, right: left + right)
Monoids
A monoid is a set with an associative binary operation and an identity element. reduce and reduceByKey rely on this: partial results from different partitions can be combined in any grouping, so the combining function must be associative.
Examples: addition of integers; element-wise addition of (sum, count) pairs, as in the demo below.
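A minimal sketch of the same idea with the identity element made explicit (the RDD and names here are illustrative, not from the original demos): aggregateByKey takes the monoid's zero value, a function that folds a raw value into the running (sum, count), and an associative merge for partial results from different partitions.

# illustrative sketch: the (sum, count) pair is a monoid with identity (0, 0)
pairs = sc.parallelize([('a', 3), ('a', 5), ('b', 4)])

sums_and_counts = pairs.aggregateByKey(
    (0, 0),                                           # identity element
    lambda acc, value: (acc[0] + value, acc[1] + 1),  # fold one value into (sum, count)
    lambda left, right: (left[0] + right[0],          # associative merge of partials
                         left[1] + right[1]))

averages = sums_and_counts.mapValues(lambda pair: pair[0] * 1.0 / pair[1])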
Demo
# example 3: averages via (sum, count) pairs
dataset = sc.parallelize([
    {'id': 1, 'value': 1},
    {'id': 2, 'value': 2},
    {'id': 2, 'value': 6}
])

def add_tuples(left, right):
    left_sum, left_count = left
    right_sum, right_count = right
    return (left_sum + right_sum, left_count + right_count)

# a single (sum, count) pair for the whole dataset (divide to get the overall average)
averages = dataset.map(lambda d: (d['value'], 1)) \
    .reduce(add_tuples)

# per-key averages: reduce the (sum, count) pairs for each id, then divide
averages_by_key = dataset.map(lambda d: (d['id'], (d['value'], 1))) \
    .reduceByKey(add_tuples) \
    .map(lambda pair: (pair[0], pair[1][0] * 1.0 / pair[1][1]))
# example 4
from datetime import date
dataset = sc.parallelize([
    {'id': 1, 'group_id': 10, 'timestamp': date(1978, 3, 2)},
    {'id': 2, 'group_id': 10, 'timestamp': date(1984, 3, 24)},
    {'id': 3, 'group_id': 10, 'timestamp': date(1986, 5, 19)},
    {'id': 4, 'group_id': 11, 'timestamp': date(1956, 6, 5)},
    {'id': 5, 'group_id': 11, 'timestamp': date(1953, 2, 21)}
])
def calculate_age(d):
    d['age'] = (date.today() - d['timestamp']).days
    return d

def calculate_group_stats(left, right):
    earliest = min(left['earliest'], right['earliest'])
    latest = max(left['latest'], right['latest'])
    total_age = left['total_age'] + right['total_age']
    count = left['count'] + right['count']
    return {
        'earliest': earliest,
        'latest': latest,
        'total_age': total_age,
        'count': count
    }

group_stats = dataset.map(calculate_age) \
    .map(lambda d: (d['group_id'], {'earliest': d['timestamp'],
                                    'latest': d['timestamp'],
                                    'total_age': d['age'],
                                    'count': 1})) \
    .reduceByKey(calculate_group_stats)
Joining RDDs
Two datasets that share an id can be joined once each record is keyed by that id:
{id: 1, field1: foo}
{id: 2, field1: bar}
Demo
# example 5: joining two keyed RDDs
first_dataset = sc.parallelize([
    {'id': 1, 'field1': 'foo'},
    {'id': 2, 'field1': 'bar'},
    {'id': 2, 'field1': 'baz'},
    {'id': 3, 'field1': 'foo'}
])
first_dataset = first_dataset.map(lambda d: (d['id'], d))

second_dataset = sc.parallelize([
    {'id': 1, 'field2': 'abc'},
    {'id': 2, 'field2': 'def'}
])
second_dataset = second_dataset.map(lambda d: (d['id'], d))

output = first_dataset.join(second_dataset)
Key Skew
When a few keys hold most of the records, a shuffled join funnels them onto a few partitions; broadcasting the smaller dataset and joining map-side avoids that shuffle.
Demo
# example 6: map-side join by broadcasting the smaller dataset
first_dataset = sc.parallelize([
    {'id': 1, 'field1': 'foo'},
    {'id': 2, 'field1': 'bar'},
    {'id': 2, 'field1': 'baz'},
    {'id': 3, 'field1': 'foo'}
])
first_dataset = first_dataset.map(lambda d: (d['id'], d))

second_dataset = sc.parallelize([
    {'id': 1, 'field2': 'abc'},
    {'id': 2, 'field2': 'def'}
])
second_dataset = second_dataset.map(lambda d: (d['id'], d))

# ship the small dataset to every executor instead of shuffling the large one
second_dict = sc.broadcast(second_dataset.collectAsMap())

def join_records(pair):
    key, record = pair
    if key in second_dict.value:
        yield (key, (record, second_dict.value[key]))

output = first_dataset.flatMap(join_records)
Ordering
Problem: downstream logic often needs each key's records in order (for example, by timestamp), and repartitioning and sorting as separate steps is expensive.
Solution: repartitionAndSortWithinPartitions partitions by key and sorts within each partition as part of the same shuffle (see the sketch after the sample records below).
Example records, partitioned and sorted:
{id: 1, value: 10}
{id: 2, value: 10}
{id: 3, value: 20}
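A minimal sketch of how that can be set up, using a small made-up event RDD (the data, field names, and partition count are illustrative assumptions, not from the original deck): key each record by (id, timestamp), partition on the id alone, and let the shuffle sort each partition by the full key.

# illustrative sketch: partition by id, sort within each partition by timestamp
from datetime import date

events = sc.parallelize([
    {'id': 1, 'timestamp': date(2015, 1, 1)},
    {'id': 1, 'timestamp': date(2015, 1, 6)},
    {'id': 1, 'timestamp': date(2015, 1, 8)},
    {'id': 2, 'timestamp': date(2015, 1, 2)},
    {'id': 2, 'timestamp': date(2015, 1, 7)}
])

keyed = events.map(lambda d: ((d['id'], d['timestamp']), d))
partitioned = keyed.repartitionAndSortWithinPartitions(
    numPartitions=4,                         # illustrative partition count
    partitionFunc=lambda key: hash(key[0]),  # route records by id only
    keyfunc=lambda key: key)                 # sort each partition by (id, timestamp)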
MapPartitions
With each partition already sorted, mapPartitions can walk consecutive records for a key and emit the interval between them:
{id: 1, interval: 5}
{id: 1, interval: 2}
{id: 2, interval: 5}
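Continuing the sketch above (again illustrative, not the original demo code): mapPartitions walks each sorted partition once and emits the gap, in days, between consecutive events with the same id, producing interval records like the ones shown above.

# illustrative sketch: compute the interval between consecutive events per id
def intervals(records):
    previous = None
    for (record_id, timestamp), record in records:
        if previous is not None and previous[0] == record_id:
            yield {'id': record_id,
                   'interval': (timestamp - previous[1]).days}
        previous = (record_id, timestamp)

interval_rdd = partitioned.mapPartitions(intervals)
interval_rdd.collect()
# e.g. [{'id': 1, 'interval': 5}, {'id': 1, 'interval': 2}, {'id': 2, 'interval': 5}]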
Thanks!