How to manipulate data with both SQL and pandas - Advanced
Advanced
Set up the dataset
1
2
3
4
5
6
7
8
|
import pandas as pd

# Toy dataset used by every example below: five (name, company, year, time)
# records, with one tie on time (10 appears twice) and repeated years per name
# so the window-function examples have something interesting to rank over.
records = {
    'name': ['A', 'A', 'A', 'B', 'B'],
    'company': ['TT', 'KK', 'SS', 'XX', 'SS'],
    'year': [2019, 2019, 2020, 2019, 2020],
    'time': [4, 18, 10, 6, 10],
}
df = pd.DataFrame(records)
df
|
|
name |
company |
year |
time |
0 |
A |
TT |
2019 |
4 |
1 |
A |
KK |
2019 |
18 |
2 |
A |
SS |
2020 |
10 |
3 |
B |
XX |
2019 |
6 |
4 |
B |
SS |
2020 |
10 |
Window functions
LEAD() LAG()
1
2
3
4
5
6
7
|
-- LEAD(year, 2): the `year` value two rows ahead in the ORDER BY year, name
-- ordering; NULL when fewer than two rows follow.
-- LAG(year, 3): the `year` value three rows back in the same ordering; NULL
-- for the first three rows.
SELECT name
, company
, year
, time
, LEAD(year, 2) OVER(ORDER BY year, name) AS lead2
, LAG(year, 3) OVER(ORDER BY year, name) AS lag3
FROM df
|
is the same as
1
2
3
|
# pandas equivalent of LEAD(year, 2) / LAG(year, 3) OVER(ORDER BY year, name):
# sort first, THEN shift inside the sorted frame.
# NOTE: passing `df['year'].shift(-2)` directly would compute the shift in the
# *original* row order and silently re-align it by index — which does not match
# the SQL ORDER BY. A callable makes shift() operate on the frame produced by
# sort_values, exactly like the window's ordering.
df.sort_values(['year', 'name']).assign(
    lead2=lambda d: d['year'].shift(-2),   # LEAD(year, 2)
    lag3=lambda d: d['year'].shift(3))     # LAG(year, 3)
|
|
name |
company |
year |
time |
lead2 |
lag3 |
0 |
A |
TT |
2019 |
4 |
2020.0 |
NaN |
1 |
A |
KK |
2019 |
18 |
2019.0 |
NaN |
3 |
B |
XX |
2019 |
6 |
NaN |
2019.0 |
2 |
A |
SS |
2020 |
10 |
2020.0 |
NaN |
4 |
B |
SS |
2020 |
10 |
NaN |
2019.0 |
ROW_NUMBER(), RANK(), DENSE_RANK()
1
2
3
4
5
6
7
8
|
-- Three ranking windows over the time ordering:
--   ROW_NUMBER(): unique positions 1..N; the extra `name` key breaks the tie
--                 at time = 10 deterministically.
--   RANK():       tied rows share a rank and leave a gap after (1,2,3,3,5).
--   DENSE_RANK(): tied rows share a rank with no gap (1,2,3,3,4).
SELECT name
, company
, year
, time
, ROW_NUMBER() OVER(ORDER BY time, name) AS row_number_d
, RANK() OVER(ORDER BY time) AS rank_d
, DENSE_RANK() OVER(ORDER BY time) AS dense_rank_d
FROM df
|
1
2
3
4
|
# pandas equivalents of ROW_NUMBER() / RANK() / DENSE_RANK().
# Callables are used so that rank() runs on the frame *after* sort_values:
# method='first' breaks ties by position, so it must see the
# ORDER BY time, name row order — ranking the unsorted df['time'] would break
# the tie at time == 10 by original index instead.
# (method='min' and method='dense' are order-independent, but the callable
# form keeps all three consistent.)
df.sort_values(['time', 'name']).assign(
    row_number_d=lambda d: d['time'].rank(method='first').astype(int),  # ROW_NUMBER()
    rank_d=lambda d: d['time'].rank(method='min').astype(int),          # RANK()
    dense_rank_d=lambda d: d['time'].rank(method='dense').astype(int))  # DENSE_RANK()
|
|
name |
company |
year |
time |
row_number_d |
rank_d |
dense_rank_d |
0 |
A |
TT |
2019 |
4 |
1 |
1 |
1 |
3 |
B |
XX |
2019 |
6 |
2 |
2 |
2 |
2 |
A |
SS |
2020 |
10 |
3 |
3 |
3 |
4 |
B |
SS |
2020 |
10 |
4 |
3 |
3 |
1 |
A |
KK |
2019 |
18 |
5 |
5 |
4 |
Aggregation and partitioning
1
2
3
4
5
6
7
8
9
|
SELECT name
, company
, year
, time
-- One value per partition, repeated on every row of that name.
, AVG(time) OVER(PARTITION BY name) AS avg_time_by_name
-- Running total of time within each name, in year order.
-- ROWS (not RANGE) is required: with RANGE, rows tied on `year` are peers
-- and would share one total (22, 22 for name A's two 2019 rows) instead of
-- the per-row running sum (4, 22) that pandas cumsum() produces.
, SUM(time) OVER(PARTITION BY name ORDER BY year
ROWS BETWEEN UNBOUNDED PRECEDING
AND CURRENT ROW) AS cum_sum_time_by_name
FROM df
|
1
2
3
4
5
|
# AVG(time) OVER(PARTITION BY name)            -> groupby(...).transform('mean')
# SUM(time) OVER(PARTITION BY name ORDER BY year ROWS ... CURRENT ROW)
#                                              -> sort by year, then cumsum.
# kind='stable' keeps rows with equal `year` in their original order so the
# running sum is deterministic (pandas' default quicksort is not stable).
# transform() returns a Series indexed like its input, and assign() re-aligns
# it onto df by index, so the result keeps df's original row order.
df.assign(avg_time_by_name=df.groupby('name')['time']
                             .transform('mean'),
          cum_sum_time_by_name=df.sort_values('year', kind='stable')
                                 .groupby('name')['time']
                                 .transform('cumsum'))
|
|
name |
company |
year |
time |
avg_time_by_name |
cum_sum_time_by_name |
0 |
A |
TT |
2019 |
4 |
10.666667 |
4 |
1 |
A |
KK |
2019 |
18 |
10.666667 |
22 |
2 |
A |
SS |
2020 |
10 |
10.666667 |
32 |
3 |
B |
XX |
2019 |
6 |
8.000000 |
6 |
4 |
B |
SS |
2020 |
10 |
8.000000 |
16 |
More
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
SELECT ROW_NUMBER() OVER(PARTITION BY name
ORDER BY year) AS number
, name
, company
, year
-- LEAD(year): this person's following year; NULL on their last row.
, LEAD(year) OVER(PARTITION BY name
ORDER BY year) AS next_year
-- Gap in years until the next record for the same name. `year` is a plain
-- integer column, so simple subtraction is correct here; DATEDIFF would
-- require real date values and would return a day count, not a year count.
, LEAD(year) OVER(PARTITION BY name
ORDER BY year) - year AS gap
, time
, AVG(time) OVER(PARTITION BY name) AS avg_time_by_name
-- ROWS (not RANGE): rows tied on `year` must each get their own running
-- total, matching pandas cumsum().
, SUM(time) OVER(PARTITION BY name ORDER BY year
ROWS BETWEEN UNBOUNDED PRECEDING
AND CURRENT ROW) AS cum_sum_time_by_name
FROM df
ORDER BY name, year
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
# Sort by partition key then ORDER BY key, mirroring
# PARTITION BY name ORDER BY year. kind='stable' keeps rows tied on
# (name, year) in their original order so the results are deterministic.
df.sort_values(['name', 'year'], kind='stable', inplace=True)
# Append the windowed columns. Each groupby(...).rank/transform returns a
# Series aligned to df's index, so assigning from the (already sorted) df is
# safe here.
df = df.assign(
    # ROW_NUMBER(): method='first' numbers rows 1, 2, 3 within each name,
    # breaking ties by position. (method='min' would emulate RANK() instead
    # and repeat numbers for tied years, e.g. 1, 1, 3.)
    number=df.groupby('name')['year']
             .rank(method='first')
             .astype(int),
    # LEAD(year): the following year within the same name; NaN on last row.
    next_year=df.groupby('name')['year']
                .transform(lambda s: s.shift(-1)),
    # Gap in years until that next record.
    gap=df.groupby('name')['year']
          .transform(lambda s: s.shift(-1)) - df['year'],
    # AVG(time) OVER(PARTITION BY name).
    avg_time_by_name=df.groupby('name')['time']
                       .transform('mean'),
    # Running SUM(time) within each name, in year order (df is sorted).
    cum_sum_time_by_name=df.groupby('name')['time']
                           .transform('cumsum'))
# Reorder columns to match the SQL SELECT list.
df[['number', 'name', 'company',
    'year', 'next_year',
    'gap', 'time', 'avg_time_by_name',
    'cum_sum_time_by_name']]
|
|
number |
name |
company |
year |
next_year |
gap |
time |
avg_time_by_name |
cum_sum_time_by_name |
0 |
1 |
A |
TT |
2019 |
2019.0 |
0.0 |
4 |
10.666667 |
4 |
1 |
1 |
A |
KK |
2019 |
2020.0 |
1.0 |
18 |
10.666667 |
22 |
2 |
3 |
A |
SS |
2020 |
NaN |
NaN |
10 |
10.666667 |
32 |
3 |
1 |
B |
XX |
2019 |
2020.0 |
1.0 |
6 |
8.000000 |
6 |
4 |
2 |
B |
SS |
2020 |
NaN |
NaN |
10 |
8.000000 |
16 |