Contents

SQL queries in pandas- Advanced

How to manipulate data with both SQL and pandas - Advanced

Advanced

set up the dataset

1
2
3
4
5
6
7
8
import pandas as pd
df = pd.DataFrame({'name': ['A', 'A', 'A', 'B', 'B'],
                   'company': ['TT', 'KK', 'SS', 'XX', 'SS'],
                   'year': [2019, 2019,
                                2020, 2019,
                                2020],
                   'time': [4, 18, 10, 6, 10]})
df

name company year time
0 A TT 2019 4
1 A KK 2019 18
2 A SS 2020 10
3 B XX 2019 6
4 B SS 2020 10

Window functions

LEAD() LAG()

1
2
3
4
5
6
7
SELECT name
       , company
       , year
       , time
       , LEAD(year, 2) OVER(ORDER BY year, name) AS lead2
       , LAG(year, 3) OVER(ORDER BY year, name) AS lag3
FROM df



is same as

1
2
3
df.sort_values(['year', 'name']).assign(
              lead2 = df['year'].shift(-2),
              lag3 = df['year'].shift(3))

name company year time lead2 lag3
0 A TT 2019 4 2020.0 NaN
1 A KK 2019 18 2019.0 NaN
3 B XX 2019 6 NaN 2019.0
2 A SS 2020 10 2020.0 NaN
4 B SS 2020 10 NaN 2019.0

ROW_NUMBER(), RANK(), DENSE_RANK()

1
2
3
4
5
6
7
8
SELECT name
       , company
       , year
       , time
       , ROW_NUMBER() OVER(ORDER BY time, name) AS row_number_d
       , RANK() OVER(ORDER BY time) AS rank_d
       , DENSE_RANK() OVER(ORDER BY time) AS dense_rank_d
FROM df



1
2
3
4
df.sort_values(['time', 'name']).assign(
    row_number_d = df['time'].rank(method='first').astype(int),
    rank_d = df['time'].rank(method='min').astype(int),
    dense_rank_d = df['time'].rank(method='dense').astype(int))

name company year time row_number_d rank_d dense_rank_d
0 A TT 2019 4 1 1 1
3 B XX 2019 6 2 2 2
2 A SS 2020 10 3 3 3
4 B SS 2020 10 4 3 3
1 A KK 2019 18 5 5 4

Aggregate and partitioning

1
2
3
4
5
6
7
8
9
SELECT name
       , company
       , year
       , time
       , AVG(time) OVER(PARTITION BY name) AS avg_time_by_name
       , SUM(time) OVER(PARTITION BY name ORDER BY year
                            RANGE BETWEEN UNBOUNDED PRECEDING
                            AND CURRENT ROW) AS cum_sum_time_by_name
FROM df



1
2
3
4
5
df.assign(avg_time_by_name=df.groupby('name')['time']
                         .transform('mean'),
          cum_sum_time_by_name=df.sort_values('year')
                             .groupby('name')['time']
                             .transform('cumsum'))

name company year time avg_time_by_name cum_sum_time_by_name
0 A TT 2019 4 10.666667 4
1 A KK 2019 18 10.666667 22
2 A SS 2020 10 10.666667 32
3 B XX 2019 6 8.000000 6
4 B SS 2020 10 8.000000 16

More

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
SELECT ROW_NUMBER() OVER(PARTITION BY name 
                         ORDER BY year) AS number
       , name
       , company
       , year
       , LEAD(year) OVER(PARTITION BY NAME 
                             ORDER BY year) AS next_year
       , DATEDIFF(DAY, 
                  year, 
                  LEAD(year) OVER(PARTITION BY NAME 
                                      ORDER BY year)) AS gap
       , time
       , AVG(time) OVER(PARTITION BY name) AS avg_time_by_name
       , SUM(time) OVER(PARTITION BY name ORDER BY year
                            RANGE BETWEEN UNBOUNDED PRECEDING 
                            AND CURRENT ROW) AS cum_sum_time_by_name   
FROM df
ORDER BY name, year

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
# Sort by name and year
df.sort_values(['name', 'year'], inplace=True)
# Append new columns
df = df.assign(number=df.groupby('name')['year']
                        .rank('min')
                        .astype(int),
               next_year = df.groupby('name')['year']
                                 .transform(lambda x: x.shift(-1)),
               gap = df.groupby('name')['year']
                       .transform(lambda x: x.shift(-1))-
                                            df['year'],
               avg_time_by_name = df.groupby('name')['time']
                                .transform(lambda x: x.mean()),
               cum_sum_time_by_name = df.groupby('name')['time']
                                   .transform(lambda x: x.cumsum()))
# Reorder columns
df[['number', 'name', 'company', 
           'year',  'next_year', 
           'gap', 'time',  'avg_time_by_name', 
           'cum_sum_time_by_name']]

number name company year next_year gap time avg_time_by_name cum_sum_time_by_name
0 1 A TT 2019 2019.0 0.0 4 10.666667 4
1 1 A KK 2019 2020.0 1.0 18 10.666667 22
2 3 A SS 2020 NaN NaN 10 10.666667 32
3 1 B XX 2019 2020.0 1.0 6 8.000000 6
4 2 B SS 2020 NaN NaN 10 8.000000 16