Wrangling

Mixed

Author

Sungkyun Cho

Published

October 12, 2024

Load packages
# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# statistics
import statsmodels.api as sm

# pandas options
pd.set_option('mode.copy_on_write', True)  # pandas 2.0
pd.options.display.float_format = '{:.2f}'.format  # pd.reset_option('display.float_format')
pd.options.display.max_rows = 7  # max number of rows to display

# NumPy options
np.set_printoptions(precision = 2, suppress=True)  # suppress scientific notation

# For high resolution display
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

NYC Taxi and Limousine Commission Record Data

taxi = sns.load_dataset("taxis")
taxi
                  pickup             dropoff  passengers  distance  fare  tip  \
0    2019-03-23 20:21:09 2019-03-23 20:27:24           1      1.60  7.00 2.15   
1    2019-03-04 16:11:55 2019-03-04 16:19:00           1      0.79  5.00 0.00   
2    2019-03-27 17:53:01 2019-03-27 18:00:25           1      1.37  7.50 2.36   
...                  ...                 ...         ...       ...   ...  ...   
6430 2019-03-23 22:55:18 2019-03-23 23:14:25           1      4.14 16.00 0.00   
6431 2019-03-04 10:09:25 2019-03-04 10:14:29           1      1.12  6.00 0.00   
6432 2019-03-13 19:31:22 2019-03-13 19:48:02           1      3.85 15.00 3.36   

      tolls  total   color      payment            pickup_zone  \
0      0.00  12.95  yellow  credit card        Lenox Hill West   
1      0.00   9.30  yellow         cash  Upper West Side South   
2      0.00  14.16  yellow  credit card          Alphabet City   
...     ...    ...     ...          ...                    ...   
6430   0.00  17.30   green         cash    Crown Heights North   
6431   0.00   6.80   green  credit card          East New York   
6432   0.00  20.16   green  credit card            Boerum Hill   

                      dropoff_zone pickup_borough dropoff_borough  
0              UN/Turtle Bay South      Manhattan       Manhattan  
1            Upper West Side South      Manhattan       Manhattan  
2                     West Village      Manhattan       Manhattan  
...                            ...            ...             ...  
6430                Bushwick North       Brooklyn        Brooklyn  
6431  East Flatbush/Remsen Village       Brooklyn        Brooklyn  
6432               Windsor Terrace       Brooklyn        Brooklyn  

[6433 rows x 14 columns]
taxi.describe(include="object")
         color      payment     pickup_zone           dropoff_zone  \
count     6433         6389            6407                   6388   
unique       2            2             194                    203   
top     yellow  credit card  Midtown Center  Upper East Side North   
freq      5451         4577             230                    245   

       pickup_borough dropoff_borough  
count            6407            6388  
unique              4               5  
top         Manhattan       Manhattan  
freq             5268            5206  

열의 이름 수정

rename()

taxi.rename(columns={"passengers": "persons", "payment": "pay"}, inplace=True)
taxi.head(3)
               pickup             dropoff  persons  distance  fare  tip  \
0 2019-03-23 20:21:09 2019-03-23 20:27:24        1      1.60  7.00 2.15   
1 2019-03-04 16:11:55 2019-03-04 16:19:00        1      0.79  5.00 0.00   
2 2019-03-27 17:53:01 2019-03-27 18:00:25        1      1.37  7.50 2.36   

   tolls  total   color          pay            pickup_zone  \
0   0.00  12.95  yellow  credit card        Lenox Hill West   
1   0.00   9.30  yellow         cash  Upper West Side South   
2   0.00  14.16  yellow  credit card          Alphabet City   

            dropoff_zone pickup_borough dropoff_borough  
0    UN/Turtle Bay South      Manhattan       Manhattan  
1  Upper West Side South      Manhattan       Manhattan  
2           West Village      Manhattan       Manhattan  

값의 대체

idx_cash = (taxi["pay"] == "cash")  # Boolean index

# 직접 값을 대입
taxi.loc[idx_cash, "tip"] = np.nan  # missing(NA) values

np.where()의 활용
(boolean condition, value if True, value if False)

taxi["tip"] = np.where(idx_cash, np.nan, taxi["tip"])

map()의 활용
dictionary로 입력

taxi["pay"] = taxi["pay"].map({"credit card": "Card", "cash": "Cash"})
taxi.head(3)
               pickup             dropoff  persons  distance  fare  tip  \
0 2019-03-23 20:21:09 2019-03-23 20:27:24        1      1.60  7.00 2.15   
1 2019-03-04 16:11:55 2019-03-04 16:19:00        1      0.79  5.00  NaN   
2 2019-03-27 17:53:01 2019-03-27 18:00:25        1      1.37  7.50 2.36   

   tolls  total   color   pay            pickup_zone           dropoff_zone  \
0   0.00  12.95  yellow  Card        Lenox Hill West    UN/Turtle Bay South   
1   0.00   9.30  yellow  Cash  Upper West Side South  Upper West Side South   
2   0.00  14.16  yellow  Card          Alphabet City           West Village   

  pickup_borough dropoff_borough  
0      Manhattan       Manhattan  
1      Manhattan       Manhattan  
2      Manhattan       Manhattan  

새로운 변수 생성

기본적인 Series들의 연산

taxi["total2"] = taxi["total"] - taxi["tip"]

assign()의 활용

taxi = taxi.assign(
    tip_ratio = lambda x: x["tip"] / x["total2"],
    tip_ratio_per = lambda x: x.tip_ratio / x.persons  # dot notation
)
taxi.head(3)
               pickup             dropoff  persons  distance  fare  tip  \
0 2019-03-23 20:21:09 2019-03-23 20:27:24        1      1.60  7.00 2.15   
1 2019-03-04 16:11:55 2019-03-04 16:19:00        1      0.79  5.00  NaN   
2 2019-03-27 17:53:01 2019-03-27 18:00:25        1      1.37  7.50 2.36   

   tolls  total   color   pay            pickup_zone           dropoff_zone  \
0   0.00  12.95  yellow  Card        Lenox Hill West    UN/Turtle Bay South   
1   0.00   9.30  yellow  Cash  Upper West Side South  Upper West Side South   
2   0.00  14.16  yellow  Card          Alphabet City           West Village   

  pickup_borough dropoff_borough  total2  tip_ratio  tip_ratio_per  
0      Manhattan       Manhattan   10.80       0.20           0.20  
1      Manhattan       Manhattan     NaN        NaN            NaN  
2      Manhattan       Manhattan   11.80       0.20           0.20  
# total2의 NA값 원래대로 복구
taxi["total2"] = np.where(idx_cash, taxi["total"], taxi["total2"])

잠시, 택시요금과 팁 간의 관계를 살펴보면,

taxi.plot.scatter(x="total2", y="tip", alpha=.2, figsize=(5, 3))
plt.show()

taxi.plot.scatter(x="total2", y="tip_ratio", alpha=.2, figsize=(5, 3))
plt.show()

(
    so.Plot(taxi, x='total2', y='tip_ratio')
    .add(so.Dot(alpha=.5, pointsize=2))
    .add(so.Line(color=".2"), so.PolyFit(5))
    .facet("persons", wrap=4)
    .layout(size=(8, 5))
    .label(title="{} passengers".format)
)

필터링

query()의 활용

Conditional operators
>, >=, <, <=,
== (equal to), != (not equal to)
and, & (and)
or, | (or)
not, ~ (not)
in (includes), not in (not included)

(
    taxi.query('color == "green" & pickup_borough == "Manhattan"')
    .head(3)
)
                  pickup             dropoff  persons  distance  fare  tip  \
5453 2019-03-29 18:12:27 2019-03-29 18:20:40        1      1.51  7.50 1.20   
5454 2019-03-06 11:11:33 2019-03-06 11:15:15        1      0.45  4.50  NaN   
5465 2019-03-27 20:55:35 2019-03-27 21:01:55        1      1.43  7.00 3.00   

      tolls  total  color   pay        pickup_zone           dropoff_zone  \
5453   0.00  10.50  green  Card  East Harlem North      East Harlem South   
5454   0.00   5.30  green  Cash  East Harlem North      East Harlem South   
5465   0.00  11.30  green  Card     Central Harlem  Upper West Side North   

     pickup_borough dropoff_borough  total2  tip_ratio  tip_ratio_per  
5453      Manhattan       Manhattan    9.30       0.13           0.13  
5454      Manhattan       Manhattan    5.30        NaN            NaN  
5465      Manhattan       Manhattan    8.30       0.36           0.36  

정렬

(
    taxi.query('persons > 1')
    .sort_values("tip_ratio", ascending=False)
)
                  pickup             dropoff  persons  distance  fare  tip  \
2923 2019-03-06 23:17:06 2019-03-06 23:24:12        2      1.60  7.50 7.00   
3272 2019-03-28 11:15:04 2019-03-28 11:21:01        3      0.62  5.50 5.00   
4912 2019-03-27 18:04:45 2019-03-27 18:11:07        2      0.55  5.50 5.00   
...                  ...                 ...      ...       ...   ...  ...   
6405 2019-03-21 18:28:55 2019-03-21 18:51:08        2      3.88 16.50  NaN   
6412 2019-03-20 17:33:25 2019-03-20 17:42:48        5      1.40  8.00  NaN   
6420 2019-03-16 15:39:23 2019-03-16 15:46:18        2      1.20  7.00  NaN   

      tolls  total   color   pay                   pickup_zone  \
2923   0.00  18.30  yellow  Card  Penn Station/Madison Sq West   
3272   0.00  13.80  yellow  Card      Financial District North   
4912   0.00  14.80  yellow  Card                   Murray Hill   
...     ...    ...     ...   ...                           ...   
6405   0.00  18.30   green  Cash                   Boerum Hill   
6412   0.00   9.80   green  Cash              Brooklyn Heights   
6420   0.00   7.80   green  Cash                Central Harlem   

               dropoff_zone pickup_borough dropoff_borough  total2  tip_ratio  \
2923              Hudson Sq      Manhattan       Manhattan   11.30       0.62   
3272           Battery Park      Manhattan       Manhattan    8.80       0.57   
4912    UN/Turtle Bay South      Manhattan       Manhattan    9.80       0.51   
...                     ...            ...             ...     ...        ...   
6405             Ocean Hill       Brooklyn        Brooklyn   18.30        NaN   
6412            Fort Greene       Brooklyn        Brooklyn    9.80        NaN   
6420  Upper West Side North      Manhattan       Manhattan    7.80        NaN   

      tip_ratio_per  
2923           0.31  
3272           0.19  
4912           0.26  
...             ...  
6405            NaN  
6412            NaN  
6420            NaN  

[1659 rows x 17 columns]

Grouping

  • groupby()로 데이터를 의미있는 그룹으로 나눈 후, 다음과 같은 통계치를 계산
  • size(), count(), sum(), mean(), min(), max()

Source: Ch.10 in Python for Data Analysis (3e) by Wes McKinney

taxi.groupby("color").size()  # size(): 열의 개수
color
green      982
yellow    5451
dtype: int64
taxi.groupby(["color", "pay"]).size()  # size(): 열의 개수
color   pay 
green   Card     577
        Cash     400
yellow  Card    4000
        Cash    1412
dtype: int64
taxi.groupby(["color", "pay"])["total2"].mean()
color   pay 
green   Card   18.76
        Cash   11.40
yellow  Card   17.08
        Cash   15.61
Name: total2, dtype: float64
df =(
    taxi.groupby(["color", "pay"])["total2"].mean()
    .unstack()  # wide format으로 변환
)
df
pay     Card  Cash
color             
green  18.76 11.40
yellow 17.08 15.61
df["total"] = df.sum(axis=1)
df
pay     Card  Cash  total
color                    
green  18.76 11.40  30.15
yellow 17.08 15.61  32.68
df.assign(
    Card_pct = lambda x: x.Card / x.total * 100,
    Cash_pct = lambda x: x.Cash / x.total * 100
)
pay     Card  Cash  total  Card_pct  Cash_pct
color                                        
green  18.76 11.40  30.15     62.21     37.79
yellow 17.08 15.61  32.68     52.25     47.75

여러 함수를 동시에 적용: agg()

taxi.groupby(["color", "pay"])["total2"].agg(["mean", "std", "size"])
             mean   std  size
color  pay                   
green  Card 18.76 14.73   577
       Cash 11.40 10.41   400
yellow Card 17.08 12.00  4000
       Cash 15.61 12.38  1412

가장 일반적인 방식으로 appy()를 사용하여 사용자 정의 함수를 적용

# standardize
def standardize(x):
    return (x - x.mean()) / x.std()

taxi.groupby("color", group_keys=True)["tip_ratio"].apply(standardize)
color       
green   5451     NaN
        5452   -0.95
        5453    0.23
                ... 
yellow  5448    0.33
        5449    0.33
        5450   -1.12
Name: tip_ratio, Length: 6433, dtype: float64

시간 데이터의 처리

시간을 표시하는 datetime64 타입을 이용해 시간 데이터를 처리하며,
dt accessor를 사용

# pickup 시간으로부터 요일을 추출
taxi["day"] = taxi["pickup"].dt.day_name().str[:3]  # 요일의 앞 3글자
# pickup 시간으로부터 시간대를 추출
taxi["hour"] = taxi["pickup"].dt.hour
# 택시를 탄 시간(분)을 계산
taxi["duration"] = (taxi["dropoff"] - taxi["pickup"]).dt.total_seconds() / 60
taxi.head(3)
               pickup             dropoff  persons  distance  fare  tip  \
0 2019-03-23 20:21:09 2019-03-23 20:27:24        1      1.60  7.00 2.15   
1 2019-03-04 16:11:55 2019-03-04 16:19:00        1      0.79  5.00  NaN   
2 2019-03-27 17:53:01 2019-03-27 18:00:25        1      1.37  7.50 2.36   

   tolls  total   color   pay            pickup_zone           dropoff_zone  \
0   0.00  12.95  yellow  Card        Lenox Hill West    UN/Turtle Bay South   
1   0.00   9.30  yellow  Cash  Upper West Side South  Upper West Side South   
2   0.00  14.16  yellow  Card          Alphabet City           West Village   

  pickup_borough dropoff_borough  total2  tip_ratio  tip_ratio_per  day  hour  \
0      Manhattan       Manhattan   10.80       0.20           0.20  Sat    20   
1      Manhattan       Manhattan    9.30        NaN            NaN  Mon    16   
2      Manhattan       Manhattan   11.80       0.20           0.20  Wed    17   

   duration  
0      6.25  
1      7.08  
2      7.40  

요일과 시간대별로 팁의 비율은 다른가?

taxi.groupby(["day", "hour"])["tip_ratio"].mean()
day  hour
Fri  0      0.17
     1      0.19
     2      0.12
            ... 
Wed  21     0.18
     22     0.19
     23     0.23
Name: tip_ratio, Length: 167, dtype: float64

pivot_table()를 활용할 수도 있음

taxi.pivot_table("tip_ratio", "hour", "day")
day   Fri  Mon  Sat  Sun  Thu  Tue  Wed
hour                                   
0    0.17 0.19 0.17 0.18 0.17 0.22 0.19
1    0.19  NaN 0.18 0.18 0.20 0.20 0.18
2    0.12 0.18 0.18 0.19 0.22 0.20 0.17
...   ...  ...  ...  ...  ...  ...  ...
21   0.17 0.18 0.15 0.18 0.18 0.17 0.18
22   0.17 0.17 0.18 0.18 0.16 0.18 0.19
23   0.18 0.22 0.17 0.19 0.19 0.18 0.23

[24 rows x 7 columns]
# day를 categorical 타입으로 변환
taxi["day"] = pd.Categorical(taxi["day"], categories=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])
df = taxi.pivot_table("tip_ratio", "hour", "day")

fig, axes = plt.subplots(6, 4, figsize=(8, 8), sharex=True, sharey=True)
fig.subplots_adjust(wspace=0.1, hspace=0.3)

for i, ax in enumerate(axes.flat):
    df.iloc[i].plot.bar(ax=ax, ylim=(0.1, 0.23))
    ax.set_title(f"hour={i}", fontsize=9)

taxi_day_hour = (
    taxi.groupby(["day", "hour"])["tip_ratio"]
    .agg(["mean", "size"])
    .reset_index()
    .rename(columns={"mean": "tip_ratio", "size": "n"})
)
taxi_day_hour
     day  hour  tip_ratio   n
0    Mon     0       0.19  13
1    Mon     1        NaN   2
2    Mon     2       0.18   6
..   ...   ...        ...  ..
165  Sun    21       0.18  42
166  Sun    22       0.18  32
167  Sun    23       0.19  23

[168 rows x 4 columns]
(
    so.Plot(taxi_day_hour, y='day', x='tip_ratio', color='n')
    .add(so.Bar(width=.5))
    .facet("hour", wrap=6)
    .layout(size=(8, 5.5))
    .limit(x=(0.12, 0.23))
    .theme({'ytick.labelsize': 9})
)

데이터 프레임들의 결합

Merge

  • merge()를 사용하여 두 데이터 프레임을 join
  • Key에 해당하는 변수들의 값이 매치되는 행들을 찾아 결합
  • 결합 방식: “inner”, “left”, “right”, “outer”

Inner join의 예:

nyc_neighborhood.csv dataset

nyc = pd.read_csv("data/nyc_neighborhood.csv")
nyc
         borough  units  labor  carfree  density      pop
0      Manhattan   5083   0.68     0.88    71.90  1628706
1          Bronx   6120   0.59     0.71    33.70  1418207
2       Brooklyn  10129   0.64     0.75    36.90  2559903
3         Queens   6752   0.64     0.59    20.70  2253858
4  Staten Island    582   0.62     0.34     8.30   476143
(
    taxi.merge(
        nyc, left_on="pickup_borough", right_on="borough", how="left"
    ).head(3)
)
               pickup             dropoff  persons  distance  fare  tip  \
0 2019-03-23 20:21:09 2019-03-23 20:27:24        1      1.60  7.00 2.15   
1 2019-03-04 16:11:55 2019-03-04 16:19:00        1      0.79  5.00  NaN   
2 2019-03-27 17:53:01 2019-03-27 18:00:25        1      1.37  7.50 2.36   

   tolls  total   color   pay  ... tip_ratio_per  day hour duration  \
0   0.00  12.95  yellow  Card  ...          0.20  Sat   20     6.25   
1   0.00   9.30  yellow  Cash  ...           NaN  Mon   16     7.08   
2   0.00  14.16  yellow  Card  ...          0.20  Wed   17     7.40   

     borough   units  labor carfree  density        pop  
0  Manhattan 5083.00   0.68    0.88    71.90 1628706.00  
1  Manhattan 5083.00   0.68    0.88    71.90 1628706.00  
2  Manhattan 5083.00   0.68    0.88    71.90 1628706.00  

[3 rows x 26 columns]

Merge를 데이터 필터링에 사용할 수도 있음

taxi_day_hour
     day  hour  tip_ratio   n
0    Mon     0       0.19  13
1    Mon     1        NaN   2
2    Mon     2       0.18   6
..   ...   ...        ...  ..
165  Sun    21       0.18  42
166  Sun    22       0.18  32
167  Sun    23       0.19  23

[168 rows x 4 columns]
days_top_ratio = (
    taxi_day_hour
    .query('n > 10')
    .nlargest(3, "tip_ratio")
)
days_top_ratio
    day  hour  tip_ratio   n
71  Wed    23       0.23  39
23  Mon    23       0.22  22
24  Tue     0       0.22  13

팁의 비율이 가장 높았던 3개의 (요일, 시간대) 조합에 해당하는 taxi 데이터셋을 추리려면

taxi.merge(days_top_ratio, on=["day", "hour"])
                pickup             dropoff  persons  distance  fare  tip  \
0  2019-03-25 23:05:54 2019-03-25 23:11:13        1      0.80  5.50 2.30   
1  2019-03-27 23:10:02 2019-03-27 23:22:11        3      2.77 11.00 3.70   
2  2019-03-20 23:19:55 2019-03-20 23:46:00        1      4.82 19.50 4.66   
..                 ...                 ...      ...       ...   ...  ...   
71 2019-03-18 23:48:38 2019-03-18 23:56:36        1      2.00  8.50  NaN   
72 2019-03-06 23:32:50 2019-03-06 23:34:31        1      0.30  3.50 2.50   
73 2019-03-26 00:06:16 2019-03-26 00:12:46        1      0.86  6.00  NaN   

    tolls  total   color   pay  ... pickup_borough dropoff_borough total2  \
0    0.00  11.60  yellow  Card  ...      Manhattan       Manhattan   9.30   
1    0.00  18.50  yellow  Card  ...      Manhattan       Manhattan  14.80   
2    0.00  27.96  yellow  Card  ...      Manhattan       Manhattan  23.30   
..    ...    ...     ...   ...  ...            ...             ...    ...   
71   0.00   9.80   green  Cash  ...         Queens          Queens   9.80   
72   0.00   7.30   green  Card  ...         Queens          Queens   4.80   
73   0.00   7.30   green  Cash  ...      Manhattan       Manhattan   7.30   

   tip_ratio_x  tip_ratio_per  day  hour duration  tip_ratio_y   n  
0         0.25           0.25  Mon    23     5.32         0.22  22  
1         0.25           0.08  Wed    23    12.15         0.23  39  
2         0.20           0.20  Wed    23    26.08         0.23  39  
..         ...            ...  ...   ...      ...          ...  ..  
71         NaN            NaN  Mon    23     7.97         0.22  22  
72        0.52           0.52  Wed    23     1.68         0.23  39  
73         NaN            NaN  Tue     0     6.50         0.22  13  

[74 rows x 22 columns]

Concatenate

pd.concat([df1, df2, ...], axis=)
행과 열의 index를 매치시켜 두 DataFrame/Series를 합침

df1 = pd.DataFrame(
    np.arange(6).reshape(3, 2), index=["a", "b", "c"], columns=["one", "two"]
)
df2 = pd.DataFrame(
    5 + np.arange(4).reshape(2, 2), index=["a", "c"], columns=["three", "four"]
)
   one  two
a    0    1
b    2    3
c    4    5
   three  four
a      5     6
c      7     8
pd.concat([df1, df2], axis=1)
   one  two  three  four
a    0    1   5.00  6.00
b    2    3    NaN   NaN
c    4    5   7.00  8.00
pd.concat([df1, df2])  # default: axis=0
   one  two  three  four
a 0.00 1.00    NaN   NaN
b 2.00 3.00    NaN   NaN
c 4.00 5.00    NaN   NaN
a  NaN  NaN   5.00  6.00
c  NaN  NaN   7.00  8.00