How to manipulate data with both SQL and pandas - Basics
Basic SQL
Data view
SQL
|
pandas
|
1
2
|
select top 5 *
from df
|
|
|
Filter
SQL
|
pandas
|
1
2
3
|
select top 5 *
from df
where day = 'MON'
|
|
1
2
3
|
df[df.days=="MON"]
df.query("day=='MON'")
df.loc[df.day=="MON"]
|
|
1
2
3
|
select top 5 *
from df
where day in ('MON', 'TUE')
|
|
1
2
|
df.query("day in (['MON','TUE'])")
df[df.day.isin(["MON", "TUE"])]
|
|
1
2
3
|
select top 5 *
from df
where day like 'M%'
|
|
1
2
|
df.query("day.str.startswith('M)")
df[df.day.str.match(r'[Mm].')]
|
|
1
2
3
|
select top 5 *
from df
where num between 5 and 6
|
|
1
2
|
df[df.num.between(5,6)]
df.query("5<=num<=6")
|
|
1
2
3
|
select top 5 *
from df
where num is null
|
|
1
2
|
df[df.num.isna()]
df.query("num.isna()")
|
|
1
2
3
|
select top 5 *
from df
where num >5 and size <6
|
|
1
2
3
|
df[(df.num>5)&(df.size<6>)]
df.query("num>5 and size<6")
df.query("num>5 & size<6")
|
|
Sorting
SQL
|
pandas
|
1
2
3
|
select *
from df
order by no
|
|
|
1
2
3
|
select *
from df
order by no desc, size
|
|
1
2
|
df.sort_values(['no', 'size'], ascending=[0,1])
df.sort_values(['no', 'size'], ascending=[False,True])
|
|
Aggregate
SQL
|
pandas
|
1
2
3
|
select avg(no) as avg_no,
stdev(no) as std_no
from df
|
|
1
2
|
df[["no"]].agg(["mean", "std"])
df.agg({'no':["mean", "std"]})
|
|
Aggregate by group
SQL
|
pandas
|
1
2
3
4
|
select day,
avg(no) as avg_no
from df
group by day
|
|
1
|
df.groupby("day")["no"].agg("mean")
|
|