# numerical calculation & data frames
import numpy as np
import pandas as pd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so
# statistics
import statsmodels.api as sm
# pandas options
pd.set_option('mode.copy_on_write', True)  # opt in to copy-on-write (pandas 2.0)
# Show floats with two decimals; undo with pd.reset_option('display.float_format')
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_rows = 7  # max number of rows to display
# NumPy options
np.set_printoptions(precision=2, suppress=True)  # suppress scientific notation
# For high resolution display: ask the inline backend to render figures as "retina"
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")
Inspecting data
Mixed
Note
Matplotlib/Seaborn 플랏에서 흐릿하게 그려지는 경우: 고해상도 디스플레이에 최적화
import matplotlib_inline
# Render inline figures in the "retina" format so plots are not blurry
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")
Useful methods
.head(), .tail(), .sample()
.info(), .describe()
.value_counts()
.sort_values(), .nlargest(), .nsmallest()
Data: Tips
일정기간 한 웨이터가 얻은 팁에 대한 데이터
# load a dataset
tips = sns.load_dataset("tips")
tips  # display the DataFrame as the cell output
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
.. ... ... ... ... ... ... ...
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2
[244 rows x 7 columns]
tips.head(3)  # list the first n rows; the default is 5
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
tips.sample(5)  # draw a random sample of n rows; the default is 1
total_bill tip sex smoker day time size
129 22.82 2.18 Male No Thur Lunch 3
30 9.55 1.45 Male No Sat Dinner 2
234 15.53 3.00 Male Yes Sat Dinner 2
215 12.90 1.10 Female Yes Sat Dinner 2
146 18.64 1.36 Female No Thur Lunch 3
tips.info()  # per-column non-null counts and dtypes, plus memory usage
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 total_bill 244 non-null float64
1 tip 244 non-null float64
2 sex 244 non-null category
3 smoker 244 non-null category
4 day 244 non-null category
5 time 244 non-null category
6 size 244 non-null int64
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB
tips.describe()  # summarizes numerical columns only
total_bill tip size
count 244.00 244.00 244.00
mean 19.79 3.00 2.57
std 8.90 1.38 0.95
... ... ... ...
50% 17.80 2.90 2.00
75% 24.13 3.56 3.00
max 50.81 10.00 6.00
[8 rows x 3 columns]
tips.describe(include="all")  # summarizes columns of all types
total_bill tip sex smoker day time size
count 244.00 244.00 244 244 244 244 244.00
unique NaN NaN 2 2 4 2 NaN
top NaN NaN Male No Sat Dinner NaN
... ... ... ... ... ... ... ...
50% 17.80 2.90 NaN NaN NaN NaN 2.00
75% 24.13 3.56 NaN NaN NaN NaN 3.00
max 50.81 10.00 NaN NaN NaN NaN 6.00
[11 rows x 7 columns]
tips.describe(include="category")  # summarizes categorical columns only
sex smoker day time
count 244 244 244 244
unique 2 2 4 2
top Male No Sat Dinner
freq 157 151 87 176
s1 = tips.value_counts("day")  # counts for each category of the "day" column
s2 = tips.value_counts("day", sort=False)  # default: sort is True
s3 = tips.value_counts("day", ascending=True)  # default: ascending is False
s4 = tips.value_counts("day", normalize=True)  # proportion per category
s5 = tips.value_counts(["sex", "smoker"])  # counts for each unique ("sex", "smoker") combination
Tip
.value_counts()
의 결과는 Series이며 그 이름은 ‘count’ 또는 ‘proportion’임 (pandas 2.0)
Missing(NA)은 count하지 않으나 dropna=False
를 이용해 나타낼 수 있음
tips.value_counts("day", dropna=False)  # include missing values (NA) in the counts
Series에 대해서도 적용되며, DataFrame으로 컬럼을 선택해 적용할 수 있음
tips["day"].value_counts()  # tips["day"]: Series object
tips[["sex", "smoker"]].value_counts()  # DataFrame restricted to the selected columns
Data: palmerpenguins
# load a dataset
penguins = sns.load_dataset("penguins")
penguins.head()  # show the first rows
species island bill_length_mm bill_depth_mm flipper_length_mm \
0 Adelie Torgersen 39.10 18.70 181.00
1 Adelie Torgersen 39.50 17.40 186.00
2 Adelie Torgersen 40.30 18.00 195.00
3 Adelie Torgersen NaN NaN NaN
4 Adelie Torgersen 36.70 19.30 193.00
body_mass_g sex
0 3750.00 Male
1 3800.00 Female
2 3250.00 Female
3 NaN NaN
4 3450.00 Female
penguins.info()  # per-column non-null counts and dtypes, plus memory usage
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 species 344 non-null object
1 island 344 non-null object
2 bill_length_mm 342 non-null float64
3 bill_depth_mm 342 non-null float64
4 flipper_length_mm 342 non-null float64
5 body_mass_g 342 non-null float64
6 sex 333 non-null object
dtypes: float64(4), object(3)
memory usage: 18.9+ KB
penguins.describe(include="object")  # summarizes object-dtype columns only
species island sex
count 344 344 333
unique 3 3 2
top Adelie Biscoe Male
freq 152 168 168
penguins.value_counts(["island", "species"])  # counts per (island, species) combination
island species
Biscoe Gentoo 124
Dream Chinstrap 68
Adelie 56
Torgersen Adelie 52
Biscoe Adelie 44
Name: count, dtype: int64
penguins.value_counts(["sex", "species"], dropna=False)  # NA is omitted by default
sex species
Female Adelie 73
Male Adelie 73
Gentoo 61
..
Chinstrap 34
NaN Adelie 6
Gentoo 5
Name: count, Length: 8, dtype: int64
tips.sort_values("tip", ascending=False)  # largest tips first
total_bill tip sex smoker day time size
170 50.81 10.00 Male Yes Sat Dinner 3
212 48.33 9.00 Male No Sat Dinner 4
23 39.42 7.58 Male No Sat Dinner 4
.. ... ... ... ... ... ... ...
111 7.25 1.00 Female No Sat Dinner 1
67 3.07 1.00 Female Yes Sat Dinner 1
92 5.75 1.00 Female Yes Fri Dinner 2
[244 rows x 7 columns]
tips.sort_values(["size", "tip"], ascending=[False, True])  # size descending, then tip ascending
total_bill tip sex smoker day time size
125 29.80 4.20 Female No Thur Lunch 6
143 27.05 5.00 Female No Thur Lunch 6
156 48.17 5.00 Male No Sun Dinner 6
.. ... ... ... ... ... ... ...
111 7.25 1.00 Female No Sat Dinner 1
82 10.07 1.83 Female No Thur Lunch 1
222 8.58 1.92 Male Yes Fri Lunch 1
[244 rows x 7 columns]
tips.nlargest(3, "tip")  # handling of ties: keep="first", "last", "all"
total_bill tip sex smoker day time size
170 50.81 10.00 Male Yes Sat Dinner 3
212 48.33 9.00 Male No Sat Dinner 4
23 39.42 7.58 Male No Sat Dinner 4