Exploratory Data Analysis

Mixed

Author

Sungkyun Cho

Published

October 12, 2024

Load packages

# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# statistics
import statsmodels.api as sm

# pandas options
pd.set_option('mode.copy_on_write', True)  # pandas 2.0
pd.options.display.float_format = '{:.2f}'.format  # pd.reset_option('display.float_format')
pd.options.display.max_rows = 7  # max number of rows to display

# NumPy options
np.set_printoptions(precision = 2, suppress=True)  # suppress scientific notation

# For high resolution display
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

용어 정리

변수들 간의 관계에 있어서, 예측에 사용되는 변수(X)와 예측되는 변수(Y)를 구분하기 위해 여러 용어들이 사용됨.

X: feature, predictor, independent variable
Y: target/label, response, dependent variable, outcome variable, criterion

다음 네 가지의 시각화 패키지를 사용해서 그 차이를 확인해 볼 것임.

Matplotlib
pandas
seaborn
seaborn.objects

Matplotlib 방식
두 가지 interface를 제공하는데, 혼동을 야기함

MATLAB 스타일로 직접 함수를 호출하는 방법
객체를 만들어서 메서드를 호출하는 방법
각 변수의 값을 직접 입력: Series나 NumPy array

pandas/seaborn 방식

DataFrame의 변수 이름을 사용해서 mapping
값을 직접 입력하는 것도 허용

California Housing Prices

Source: Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow by Aurélien Géron

# import data
url = "https://raw.githubusercontent.com/ageron/data/main/housing/housing.csv"
housing = pd.read_csv(url)

# discretize median age
housing["median_age_cat"] = pd.cut(housing["housing_median_age"], bins=[0, 10, 20, 30, 40, np.inf], labels=["0-10", "10-20", "20-30", "30-40", "40-52"])

housing.head()

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88               41.00       880.00          129.00   
1    -122.22     37.86               21.00      7099.00         1106.00   
2    -122.24     37.85               52.00      1467.00          190.00   
3    -122.25     37.85               52.00      1274.00          235.00   
4    -122.25     37.85               52.00      1627.00          280.00   

   population  households  median_income  median_house_value ocean_proximity  \
0      322.00      126.00           8.33           452600.00        NEAR BAY   
1     2401.00     1138.00           8.30           358500.00        NEAR BAY   
2      496.00      177.00           7.26           352100.00        NEAR BAY   
3      558.00      219.00           5.64           341300.00        NEAR BAY   
4      565.00      259.00           3.85           342200.00        NEAR BAY   

  median_age_cat  
0          40-52  
1          20-30  
2          40-52  
3          40-52  
4          40-52

housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   longitude           20640 non-null  float64 
 1   latitude            20640 non-null  float64 
 2   housing_median_age  20640 non-null  float64 
 3   total_rooms         20640 non-null  float64 
 4   total_bedrooms      20433 non-null  float64 
 5   population          20640 non-null  float64 
 6   households          20640 non-null  float64 
 7   median_income       20640 non-null  float64 
 8   median_house_value  20640 non-null  float64 
 9   ocean_proximity     20640 non-null  object  
 10  median_age_cat      20640 non-null  category
dtypes: category(1), float64(9), object(1)
memory usage: 1.6+ MB

pd.options.display.max_rows = 8  # max number of rows to display

housing.describe()  # summary statistics for numerical columns

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
count   20640.00  20640.00            20640.00     20640.00        20433.00   
mean     -119.57     35.63               28.64      2635.76          537.87   
std         2.00      2.14               12.59      2181.62          421.39   
min      -124.35     32.54                1.00         2.00            1.00   
25%      -121.80     33.93               18.00      1447.75          296.00   
50%      -118.49     34.26               29.00      2127.00          435.00   
75%      -118.01     37.71               37.00      3148.00          647.00   
max      -114.31     41.95               52.00     39320.00         6445.00   

       population  households  median_income  median_house_value  
count    20640.00    20640.00       20640.00            20640.00  
mean      1425.48      499.54           3.87           206855.82  
std       1132.46      382.33           1.90           115395.62  
min          3.00        1.00           0.50            14999.00  
25%        787.00      280.00           2.56           119600.00  
50%       1166.00      409.00           3.53           179700.00  
75%       1725.00      605.00           4.74           264725.00  
max      35682.00     6082.00          15.00           500001.00

# 카테고리형 변수에 대한 검토
housing["ocean_proximity"].value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

위도, 경도 값의 활용

Show Matplotlib styles

plt.style.available

# set the style
plt.style.use('seaborn-v0_8-whitegrid')

lat, lon = housing['latitude'], housing['longitude']

## MATLAB 스타일
# figure() 함수를 직접 호출
plt.figure(figsize=(7, 5)) # create a plot figure, figsize는 생략가능

# scatter() 함수를 직접 호출
plt.scatter(x=lon, y=lat, label=None, edgecolors="w", linewidths=.4, alpha=0.3)

# set the labels
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.axis('equal') # set the aspect of the plot to be equal

plt.show()

## 객체 방식
# figure, axes라는 객체를 생성 후 메서드를 호출
fig, ax = plt.subplots(figsize=(7, 5)) 

# ax의 메서드인 .scatter로 그래프를 그림
ax.scatter(x=lon, y=lat, label=None, edgecolors="w", linewidths=.4, alpha=0.3)

# ax의 메서드인 .set_xlabel, .set_ylabel로 라벨을 지정
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
ax.axis('equal')  # set the aspect of the plot to be equal

plt.show()

# pandas의 plot 메서드를 사용하는 방식
housing.plot.scatter(x="longitude", y="latitude", alpha=0.3)

plt.axis('equal') # set the aspect of the plot to be equal
plt.show()

# 다음과 동일함
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.3)

plt.axis('equal') # set the aspect of the plot to be equal
plt.show()

pandas가 제공하는 plots

‘line’ : line plot (default)
‘bar’ : vertical bar plot
‘barh’ : horizontal bar plot
‘hist’ : histogram
‘box’ : boxplot
‘kde’ : Kernel Density Estimation plot
‘density’ : same as ‘kde’
‘area’ : area plot
‘pie’ : pie plot
‘scatter’ : scatter plot (DataFrame only)
‘hexbin’ : hexbin plot (DataFrame only)

# NEAR OCEAN에 해당하는 부분만 시각화
housing2 = housing.query('ocean_proximity == "NEAR OCEAN"')

housing2.plot.scatter(x="longitude", y="latitude", alpha=0.3, figsize=(7, 5))

plt.axis('equal') # set the aspect of the plot to be equal
plt.show()

# Seaborn을 사용하는 방식
plt.figure(figsize=(7, 5))
sns.scatterplot(housing, x="longitude", y="latitude", hue="ocean_proximity", alpha=0.5)

plt.axis('equal') # set the aspect of the plot to be equal
plt.show()

The San Francisco Bay Area

집값과의 관계를 보기 위해, 집값을 컬러에 매핑하면,

housing.plot.scatter(
    x="longitude",
    y="latitude",
    s=housing["population"] / 100,  # point size
    c="median_house_value",  # color
    alpha=0.3,  # transparency
    cmap="flare",  # color map
    figsize=(7, 5),
)

plt.axis('equal') # set the aspect of the plot to be equal
plt.show()

Text Annotation 추가

아래 코드를 추가하여 도시 이름을 표시

# Load California city coordinates and populations
path = "https://raw.githubusercontent.com/jakevdp/PythonDataScienceHandbook/master/notebooks_v1/data/california_cities.csv"
cities = pd.read_csv(path)

# Keep only the cities whose total population exceeds 400,000
popular_cities = cities.query('population_total > 400000')
lat, lon, names = popular_cities['latd'], popular_cities['longd'], popular_cities["city"]

# Mark every selected city with an opaque white dot
plt.scatter(lon, lat, c="w", alpha=1)

# Label each dot, shifted 5 points right and up. The loop uses its own
# variable names so the lat/lon Series are not rebound to scalars.
for name, city_lat, city_lon in zip(names, lat, lon):
    plt.annotate(name, (city_lon, city_lat), xytext=(5, 5), textcoords="offset points", color="k")

Color 사용에 관한 체계적 가이드

Choosing color palettes from Seaborn website

데이터의 분포

Histogram, density plot, boxplot

# pandas의 DataFrame 메서드인 hist()를 사용
housing.hist(bins=50, figsize=(9, 6))
plt.show()

# density plot
housing.plot.density(bw_method=0.2, subplots=True, layout=(3, 3), sharex=False, sharey=False, figsize=(9, 6))
plt.show()

# Using matplotlib: a 3x3 grid of histograms for the first nine columns
fig, ax = plt.subplots(3, 3, figsize=(9, 6))
fig.subplots_adjust(hspace=0.5, wspace=0.5)

# ax.flat walks the grid row by row, so panel k shows the k-th column
for k, panel in enumerate(ax.flat):
    panel.hist(housing.iloc[:, k], bins=30)
    panel.set_title(housing.columns[k])

# 한 변수의 각 레벨/카테고리별로 그리기, using pandas
housing.plot.hist(column=["median_house_value"], by="ocean_proximity", sharey=False, sharex=True, figsize=(6, 8), bins=50)
plt.show()

Boxplot

source: R for Data Science

# Using pandas
housing.plot.box(column="median_house_value", by="ocean_proximity")
plt.show()

# Using seaborn
plt.figure(figsize=(9, 5))
sns.boxplot(housing, x="ocean_proximity", y="median_house_value", hue="median_age_cat", fill=False, gap=.2)
plt.show()

두 연속 변수간의 관계

housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_household"] = housing["total_bedrooms"] / housing["households"]
housing["people_per_household"] = housing["population"] / housing["households"]

또는 assign()를 사용

housing.assign(
    rooms_per_household = lambda x: x["total_rooms"] / x["households"],
    bedrooms_per_household = lambda x: x["total_bedrooms"] / x["households"],
    people_per_household = lambda x: x["population"] / x["households"]
).head(1)

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88               41.00       880.00          129.00   

   population  households  median_income  median_house_value ocean_proximity  \
0      322.00      126.00           8.33           452600.00        NEAR BAY   

  median_age_cat  rooms_per_household  bedrooms_per_household  \
0          40-52                 6.98                    1.02   

   people_per_household  
0                  2.56

housing.value_counts("median_house_value").sort_index()

median_house_value
14999.00       4
17500.00       1
22500.00       4
            ... 
499100.00      1
500000.00     27
500001.00    965
Name: count, Length: 3842, dtype: int64

# Keep only the rows with median_house_value below 500001
housing = housing.query('median_house_value < 500001')

xvar, yvar = "rooms_per_household", "median_house_value"

# Object-style Matplotlib: create the figure and axes, then draw onto ax
fig, ax = plt.subplots()
housing.plot.scatter(ax=ax, x=xvar, y=yvar, alpha=0.1, figsize=(7, 5))

# Fitted natural spline line (nspline is defined in the note further down),
# sorted by x so the line is drawn from left to right
nspline_fit = nspline(housing, xvar, yvar, df_n=15).sort_values(xvar)
nspline_fit.plot.line(ax=ax, x=xvar, y=yvar, c=".3", figsize=(7, 5))

ax.set_xlim(0, 10)
ax.set_ylim(0, 500000)
plt.show()

Natural spline fit

def nspline(df, x, y, df_n=5):
    """Fit a natural spline of column ``y`` on column ``x`` by OLS.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns.
    x : str
        Name of the predictor column.
    y : str
        Name of the response column.
    df_n : int, default 5
        Degrees of freedom passed to the ``cr()`` spline term of the formula.

    Returns
    -------
    pandas.DataFrame
        Only the ``x`` and ``y`` columns, with rows that have a missing value
        in either column removed and ``y`` replaced by the fitted values.
    """
    from statsmodels.formula.api import ols

    fitted = df[[x, y]].dropna()
    # Q() quotes the column names, so names that are not valid Python
    # identifiers (spaces, dashes, ...) can still be used in the formula.
    formula = f"Q({y!r}) ~ cr(Q({x!r}), df={df_n})"
    fitted[y] = ols(formula, data=fitted).fit().fittedvalues

    return fitted

해변에 가까운 정도(ocean_proximity)에 따라 나누어 보면,

# One panel per ocean_proximity level, laid out side by side
fig, ax = plt.subplots(1, 4, figsize=(12, 3))
fig.subplots_adjust(hspace=0.5, wspace=0.5)

types = ['NEAR OCEAN', '<1H OCEAN', 'NEAR BAY', 'INLAND']
for panel, op in zip(ax, types):
    # Rows that belong to this proximity level only
    df = housing.query(f'ocean_proximity == "{op}"')
    df.plot.scatter(ax=panel, x=xvar, y=yvar, alpha=0.1)

    # Overlay the natural spline fit, sorted by x for a left-to-right line
    nspline_fit = nspline(df, xvar, yvar, df_n=15).sort_values(xvar)
    nspline_fit.plot.line(ax=panel, x=xvar, y=yvar, c=".3")

    panel.set(title=op, xlim=(1, 12), ylim=(0, 500000))

plt.show()

seaborn.objects 방식

(
    so.Plot(housing, x='rooms_per_household', y='median_house_value')
    .add(so.Dots(alpha=.1))
    .add(so.Line(color=".3"), so.PolyFit(5))  # polynomial fit of degree 5
    .facet('ocean_proximity')
    .limit(x=(1, 12), y=(0, 500000))
    .layout(size=(8.9, 3))
)

범주형 변수의 순서 할당

pd.Categorical을 사용하여 범주형 변수의 순서를 지정할 수 있음.

housing["ocean_proximity"] = pd.Categorical(
    housing["ocean_proximity"],
    categories=["NEAR BAY", "NEAR OCEAN", "<1H OCEAN", "INLAND"],
    ordered=True,
)

해변에 가까운 정도(ocean_proximity)와 집의 연령(median_age_cat)에 따라 나누어 보면,

(
    so.Plot(
        housing.query('ocean_proximity != "ISLAND"'),
        x="rooms_per_household",
        y="median_house_value",
    )
    .add(so.Dots(alpha=0.1))
    .add(so.Line(color=".3"), so.PolyFit(5))  # polynomial fit of degree 5
    .facet(col="ocean_proximity", row="median_age_cat")
    .limit(x=(1, 12), y=(0, 500000))
    .layout(size=(8, 8))
)

해안에 가까운 정도(ocean_proximity)가 고정되어 있을 때, 그 안에서도 여전히
경도(longitude)에 따라 집값(median_house_value)이 변화하는지 살펴보면,

(
    so.Plot(
        housing.query('ocean_proximity != "ISLAND"'),
        x='longitude',
        y='median_house_value')
    .add(so.Dots(alpha=.1))
    .add(so.Line(color=".3"), so.PolyFit(5))  # polynomial fit of degree 5
    .facet("ocean_proximity")
    .layout(size=(8.9, 3))
    .share(x=False)
)

Population과의 관계가 있을까?

Show the code

housing2 = housing.copy()
housing2["ocean_proximity"] = (
    housing
    .query('ocean_proximity not in ["ISLAND", "INLAND"]')["ocean_proximity"]
    .cat.remove_unused_categories()
)

(
    so.Plot(housing2, x='longitude', y='population')
    .add(so.Dots(alpha=.1))
    .add(so.Line(color=".3"), so.PolyFit(5))  # polynomial fit of degree 5
    .facet("ocean_proximity")
    .layout(size=(8.9, 3))
    .share(x=False)
    .limit(y=(0, 3000))
)

Penalized spline fit: pyGAM 참고

Telco Customer Churn

Data: Telco Customer Churn

churn = pd.read_csv("data/WA_Fn-UseC_-Telco-Customer-Churn.csv")

pd.options.display.max_columns = 30
churn.head(3)

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL            Yes          Yes   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               No          No          No              No  Month-to-month   
1              Yes          No          No              No        One year   
2               No          No          No              No  Month-to-month   

  PaperlessBilling     PaymentMethod  MonthlyCharges TotalCharges Churn  
0              Yes  Electronic check           29.85        29.85    No  
1               No      Mailed check           56.95       1889.5    No  
2              Yes      Mailed check           53.85       108.15   Yes

plt.figure(figsize=(7, 3))  # 생략 가능
sns.boxplot(data=churn, x="StreamingTV", y="tenure", hue="Churn", width=.7, gap=.2, fill=False)
plt.show()

churn_count = churn.groupby(["InternetService", "Churn"]).size().reset_index(name="count")
churn_count

  InternetService Churn  count
0             DSL    No   1962
1             DSL   Yes    459
2     Fiber optic    No   1799
3     Fiber optic   Yes   1297
4              No    No   1413
5              No   Yes    113

plt.figure(figsize=(7, 3))
sns.barplot(churn_count, y="InternetService", x="count", hue="Churn")
plt.show()

churn_perc = pd.crosstab(churn["InternetService"], churn["Churn"], normalize="index")
churn_perc

Churn             No  Yes
InternetService          
DSL             0.81 0.19
Fiber optic     0.58 0.42
No              0.93 0.07

churn_perc.plot(kind="barh", stacked=True, figsize=(7, 3))
plt.show()

(
    so.Plot(churn, y='InternetService', color='Churn')
    .add(so.Bar(), so.Hist("proportion", common_norm=["y"]), so.Stack())
    .layout(size=(7, 2))
    .scale(color="Set1")
)

plt.figure(figsize=(10, 5))
sns.countplot(churn, x="tenure", hue="Churn")

# Put a tick every 5 months and rotate the tick labels by 45 degrees
plt.xticks(np.arange(0, 72, 5), rotation=45)
plt.show()

(
    so.Plot(churn, x='tenure', color = 'Churn')
    .add(so.Bar(), so.Count(), so.Stack())
    .layout(size=(8, 5))
    .scale(color="Set1")
)

# Yes, No를 각각 1, 0으로 변환해서 Churn_n에 저장
churn["Churn_n"] = churn["Churn"].map({"Yes": 1, "No": 0})

(
    so.Plot(churn, x='tenure', y='Churn_n')
    .add(so.Dot(alpha=.05), so.Jitter(x=1, y=.05))
    .add(so.Line(), so.PolyFit(5))
    .scale(color="Set1")
)

인터넷 서비스 사용여부(InternetService)에 따른 고객 이탈여부(Churn)의 비율이 다른가?

(
    so.Plot(churn, x='tenure', y='Churn_n', color="InternetService")
    .add(so.Dot(alpha=.05), so.Jitter(x=1, y=.05))
    .add(so.Line(), so.PolyFit(5))
    .scale(color="Set1")
)

df = churn.groupby(["tenure", "InternetService"])["Churn_n"].mean().reset_index()
(
    so.Plot(df, x='tenure', y='Churn_n', color="InternetService")
    .add(so.Dot())
    .add(so.Line(), so.PolyFit(2))
    .label(y="Churn rate")
    .scale(color="Set1")
)

Logit으로 변환한 값을 살펴보면; \(\displaystyle \operatorname{logit}(p) = \log\left(\frac{p}{1-p}\right)\)

# Empirical logit of the per-group churn rate. 0.01 is added to BOTH the
# numerator and the denominator so the ratio stays positive and finite even
# when a group's rate is exactly 0 or 1 (subtracting it from the denominator
# would make the ratio negative at a rate of 1 and yield NaN).
df["logit"] = np.log((df["Churn_n"] + 0.01) / (1 - df["Churn_n"] + 0.01))

(
    so.Plot(df, x='tenure', y='logit', color="InternetService")
    .add(so.Dot())
    .add(so.Line(), so.PolyFit(2))
    .label(y="Logit of churn rate")  # the y axis shows the logit, not the raw rate
    .scale(color="Set1")
)

한글 폰트 설정

import matplotlib.font_manager as fm

# List the font names currently available on this system
sorted([f.name for f in fm.fontManager.ttflist])

# Font setup: use the first Korean-capable candidate that is actually installed
# (AppleGothic on macOS, Malgun Gothic on Windows, NanumGothic commonly on Linux)
# instead of assuming a macOS-only font.
installed_fonts = {f.name for f in fm.fontManager.ttflist}
korean_font = next(
    (name for name in ("AppleGothic", "Malgun Gothic", "NanumGothic") if name in installed_fonts),
    None,
)
if korean_font is not None:
    plt.rc('font', family=korean_font)
    # Render minus signs with the ASCII hyphen; these fonts may lack the
    # Unicode minus glyph, which would otherwise show up as an empty box.
    plt.rc('axes', unicode_minus=False)

정리

Matplotlib

시각화 대상(aesthetic)에 값을 직접 입력; Series나 NumPy array
두 가지 interface 제공: MATLAB 스타일, 객체 스타일
디테일한 조정이 가능

참고: Python Data Science Handbook/4. Visualization with Matplotlib

pandas/seaborn

시각화 대상(aesthetic)에 변수 이름을 할당; 직접 입력해도 됨
Matplotlib을 기반으로 하므로, Matplotlib의 여러 기능을 함께 사용할 수 있음
편리한 시각화 함수들 제공
대신, 많은 함수들과 기능, 다양한 파라미터로 인해 혼동이 있을 수 있음

pandas 참고: Python for Data Analysis/9.2 Plotting with pandas and seaborn
seaborn 참고: Seaborn tutorial

seaborn.objects

체계적인 시각화 원리를 기반으로 디자인되었음
사용자의 요구에 맞게 자유롭게 응용하여 시각화를 구성할 수 있음
처음에 원리를 익히는데 시간이 다소 걸릴 수 있음
아직 2% 부족한 부분이 있음

참고: The seaborn.objects interface