import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Default seaborn colour palette; its entries are reused below to colour the
# individual figures.
color = sns.color_palette()
print(color)
# Display floats with three decimal places. The option is addressed by its
# full name because the bare 'precision' key is rejected by newer pandas
# releases.
pd.set_option('display.precision', 3)
[(0.8862745098039215, 0.2901960784313726, 0.2), (0.20392156862745098, 0.5411764705882353, 0.7411764705882353), (0.596078431372549, 0.5568627450980392, 0.8352941176470589), (0.4666666666666667, 0.4666666666666667, 0.4666666666666667), (0.984313725490196, 0.7568627450980392, 0.3686274509803922), (0.5568627450980392, 0.7294117647058823, 0.25882352941176473), (1.0, 0.7098039215686275, 0.7215686274509804)]
# Load the red-wine data set; the file uses ';' as its field separator.
df = pd.read_csv(
    'winequality-red.csv',
    sep=';',
)
df.head()
# Columns, in file order:
#   fixed acidity; volatile acidity; citric acid; residual sugar; chlorides;
#   free sulfur dioxide; total sulfur dioxide; density; pH; sulphates;
#   alcohol; quality
df.info()
df.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity 1599 non-null float64
volatile acidity 1599 non-null float64
citric acid 1599 non-null float64
residual sugar 1599 non-null float64
chlorides 1599 non-null float64
free sulfur dioxide 1599 non-null float64
total sulfur dioxide 1599 non-null float64
density 1599 non-null float64
pH 1599 non-null float64
sulphates 1599 non-null float64
alcohol 1599 non-null float64
quality 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
# Show the names of all style sheets matplotlib knows about ...
available_styles = plt.style.available
print(available_styles)
# ... and switch to the built-in 'ggplot' look for the figures that follow.
plt.style.use('ggplot')
['bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark-palette', 'seaborn-dark', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'seaborn', 'Solarize_Light2', 'tableau-colorblind10', '_classic_test']
# Two equivalent ways of listing the column names.
# Option 1: convert the columns Index to a plain list (kept in `colnm`).
colnm = df.columns.tolist()
print(colnm, len(colnm), sep='\n')
# Option 2: iterating a DataFrame yields its column labels.
print()
print(list(df))
['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']
12
['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']
# Box plots: one vertical box per column, laid out on a 3 x 4 grid.
fig = plt.figure(figsize=(15, 9))
# At most twelve columns are drawn, matching the twelve cells of the grid.
for position, column in enumerate(colnm[:12], start=1):
    plt.subplot(3, 4, position)  # 3 rows, 4 columns, cell number `position`
    # The values are passed explicitly as `y` so the box is drawn vertically;
    # a bare positional Series is interpreted differently across seaborn
    # versions and may come out horizontal.
    sns.boxplot(y=df[column], orient="v", width=0.3, color=color[0])
    plt.ylabel(column, fontsize=13)
    # plt.xlabel('one_pic')
# Manual spacing: left margin, horizontal gap, top edge, vertical gap.
plt.subplots_adjust(left=0.2, wspace=0.8, top=0.9, hspace=0.1)
# tight_layout then re-adjusts the subplot parameters automatically so the
# panels fill the figure area.
plt.tight_layout()
print('箱線圖')
箱線圖
# Histograms of the twelve columns, again on a 3 x 4 grid.
fig = plt.figure(figsize=(15, 9))
for slot in range(1, 13):
    plt.subplot(3, 4, slot)  # 3 rows, 4 columns, cell number `slot`
    column = colnm[slot - 1]
    # `bins` sets how many bars the histogram is divided into.
    df[column].hist(color=color[1], bins=80)
    plt.xlabel(column, fontsize=13)
    plt.ylabel('Frequency')
# tight_layout adjusts the subplot parameters automatically so the panels
# fill the figure area.
plt.tight_layout()
# plt.savefig('hist.png')
print('直方圖')
直方圖
根據箱線圖和直方圖,這個數據集主要研究紅酒品質和理化性質之間的關系,品質質量評價范圍是0-10,這個數據集的評價范圍是3-8,其中82%的品質是5和6.
“fixed acidity”;“volatile acidity”;“citric acid”;“free sulfur dioxide”;“total sulfur dioxide”;“sulphates”;“pH”
“固定酸度”;“揮發性酸度”;“檸檬酸”;“游離二氧化硫”;“總二氧化硫”;“硫酸鹽”;“pH”
這個數據集中共有七個與酸度相關的特征;其中前六個特征都與酒的 pH 值有關。由於 pH 是對數尺度,下面先對前 6 個特征取對數,再作直方圖(histogram)。另外,pH 值主要與 fixed acidity 有關:fixed acidity 比 volatile acidity 和 citric acid 高 1 到 2 個數量級(Figure 4),比 free sulfur dioxide、total sulfur dioxide、sulphates 高 3 個數量級。新特征 total acid 由前三個特征相加得到。
# Acidity-related features whose distributions are inspected on a log10 scale.
acidityFeat = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'sulphates',
]
fig = plt.figure(figsize=(15, 9))
for panel, feature in enumerate(acidityFeat, start=1):
    plt.subplot(2, 3, panel)
    # np.clip limits values to [a_min, a_max]; flooring at 0.001 keeps zeros
    # from reaching log10.
    floored = np.clip(df[feature].values, a_min=0.001, a_max=None)
    plt.hist(np.log10(floored), bins=50, color=color[2])
    plt.xlabel('log(' + feature + ')', fontsize=12)
    plt.ylabel('Frequency')
plt.tight_layout()
print('\nFigure 3: Acidity Features in log10 Scale')
Figure 3: Acidity Features in log10 Scale
plt.figure(figsize=(10, 6))
# Bin edges spaced evenly in log10 between 1e-2 and 1e2; np.linspace includes
# the stop value by default (endpoint=True).
# print(np.linspace(-2, 2))
bins = np.power(10, np.linspace(-2, 2))
# bins = 20
# One histogram per acid, all sharing the same bin edges; `edgecolor` is the
# outline colour of the bars.
histogram_specs = (
    ('fixed acidity', {'edgecolor': 'k', 'label': 'Fixed Acidity'}),
    ('volatile acidity', {'edgecolor': 'black', 'label': 'Volatile Acidity'}),
    ('citric acid', {'edgecolor': 'red', 'alpha': 0.8, 'label': 'Citric Acid'}),
)
for column, options in histogram_specs:
    plt.hist(df[column], bins=bins, **options)
# Put the x axis of the current plot on a logarithmic scale.
plt.xscale('log')
plt.xlabel('Acid Concentration (g/dm^3)')
plt.ylabel('Frequency')
plt.title('Histogram of Acid Concentration')
plt.legend()
plt.tight_layout()
print('Figure 4')
Figure 4
# New feature: total acid = fixed + volatile + citric acid.
df['total acid'] = df['fixed acidity'] + df['volatile acidity'] + df['citric acid']
# print(df)
plt.figure(figsize=(8, 5))
# Two side-by-side panels (1 row, 2 columns): raw values on the left, natural
# log on the right. Without the plt.subplot calls both histograms would be
# drawn onto a single set of axes.
panels = (
    (121, df['total acid'], 50, color[4], 'total acid'),
    (122, np.log(df['total acid']), 80, color[5], 'log(total acid)'),
)
for position, values, n_bins, shade, label in panels:
    plt.subplot(position)
    plt.hist(values, bins=n_bins, color=shade)
    plt.xlabel(label)
    plt.ylabel('Frequency')
plt.tight_layout()
print("Figure 5: Total Acid Histogram")
Figure 5: Total Acid Histogram
Residual sugar “殘糖” 與酒的甜度相關,通常用來區別各種紅酒,干紅(<=4 g/L), 半干(4-12 g/L),半甜(12-45 g/L),和甜(>45 g/L)。 這個數據中,主要為干紅,沒有甜葡萄酒。
# Derive a categorical 'sweetness' column from residual sugar:
#   (0, 4] -> dry, (4, 12] -> medium dry, (12, 45] -> semi-sweet
sugar_edges = [0, 4, 12, 45]
sweetness_names = ["dry", "medium dry", "semi-sweet"]
df['sweetness'] = pd.cut(df['residual sugar'], bins=sugar_edges, labels=sweetness_names)
# print(df.head(10))
print()
sweetness_counts = df['sweetness'].value_counts()
print(sweetness_counts)
dry 1474
medium dry 117
semi-sweet 8
Name: sweetness, dtype: int64
# Bar chart of how many wines fall into each sweetness category.
plt.figure(figsize=(8, 5))
category_counts = df['sweetness'].value_counts()
category_counts.plot.bar(color=color)
plt.xticks(rotation=45)
plt.xlabel('sweetness', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.tight_layout()
print("Figure 6: Sweetness")
Figure 6: Sweetness
下面Figure 7和8分別顯示了紅酒理化特征和品質的關系。其中可以看出的趨勢有:
品質好的酒有更高的檸檬酸,硫酸鹽,和酒精度數。硫酸鹽(硫酸鈣)的加入通常是調整酒的酸度的。其中酒精度數和品質的相關性最高。
品質好的酒有較低的揮發性酸類,密度,和pH。
殘留糖分,氯離子,二氧化硫似乎對酒的品質影響不大。
# seaborn ships five preset themes: darkgrid, whitegrid, dark, white, ticks.
# set_style() and set_context() change matplotlib's default parameters.
sns.set_style('ticks')
sns.set_context("notebook", font_scale=1.1)
# Columns to plot: the first eleven columns of df plus the derived
# 'total acid'.
colnm = [*df.columns[:11], 'total acid']
plt.figure(figsize=(10, 8))
# One box plot per feature, grouped by quality score, on a 4 x 3 grid.
for cell, feature in enumerate(colnm, start=1):
    plt.subplot(4, 3, cell)
    sns.boxplot(data=df, x='quality', y=feature, width=0.6, color=color[1])
    plt.ylabel(feature, fontsize=12)
plt.tight_layout()
print("\nFigure 7: Physicochemical Properties and Wine Quality by Boxplot")
Figure 7: Physicochemical Properties and Wine Quality by Boxplot
sns.set_style("dark")
plt.figure(figsize=(10, 8))
colnm = df.columns.to_list()[:11] + ['total acid', 'quality']
# Choice of correlation coefficient:
#   pearson  - measures linear association (the pandas default, used here)
#   spearman - rank-based; suited to monotonic but non-linear relationships,
#              non-normal data and ordinal measurements
#   kendall  - rank-based as well; commonly used for ordinal data
# mcorr = df[colnm].corr(method='spearman')
# Non-numeric columns would first need encoding (e.g. one-hot via
# get_dummies) before a correlation can be computed.
mcorr = df[colnm].corr()
# print(mcorr)
# Boolean mask with the same shape as the correlation matrix, initially all
# False. The builtin `bool` is used because the `np.bool` alias no longer
# exists in current NumPy.
mask = np.zeros_like(mcorr, dtype=bool)
# print(mask)
# Mark the upper triangle (diagonal included) so the heatmap hides the
# duplicated half of the symmetric matrix.
mask[np.triu_indices_from(mask)] = True
# print(mask)
# Diverging colour map for the heatmap.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Heatmap annotated with the coefficients rounded to two decimals.
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
print("\nFigure 8: Pairwise Correlation Plot")
Figure 8: Pairwise Correlation Plot
密度和酒精濃度是相關的,物理上,兩者並不是線性關系。Figure 9展示了兩者的關系。另外密度還與酒中其他物質的含量有關,但是關系很小。
sns.set_style('ticks')
sns.set_context("notebook", font_scale=1.4)
# Scatter of alcohol against density with a fitted regression line.
plt.figure(figsize=(6, 4))
# scatter_kws controls the scatter markers; 's' is the marker size.
marker_style = {'s': 15}
ax = sns.regplot(data=df, x='density', y='alcohol',
                 color=color[6], scatter_kws=marker_style)
# Fix the visible range of both axes.
ax.set_xlim(0.989, 1.005)
ax.set_ylim(7, 16)
print('Figure 9: Density vs Alcohol')
Figure 9: Density vs Alcohol
由圖10可以看出來 密度與固定酸度和酒精的相關性最好
# Features plotted against density, one regression panel each.
otherFeat = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    "chlorides",
    'free sulfur dioxide',
    'total sulfur dioxide',
    'sulphates',
    'alcohol',
]
fig = plt.figure(figsize=(15, 9))
for panel, feature in enumerate(otherFeat, start=1):
    # The eight panels occupy the first cells of a 3 x 4 grid.
    plt.subplot(3, 4, panel)
    sns.regplot(data=df, x='density', y=feature,
                color=color[2], scatter_kws={'s': 15})
plt.tight_layout()
print('Figure 10: Density vs Other')
Figure 10: Density vs Other
pH和非揮發性酸性物質有-0.683的相關性。因為非揮發性酸性物質的含量遠遠高於其他酸性物質,總酸性物質(total acidity)這個特征並沒有太多意義。
# Acidity-related features plotted against pH, one regression panel each.
acidity_related = [
    'fixed acidity',
    'volatile acidity',
    'total sulfur dioxide',
    'sulphates',
    'total acid',
]
plt.figure(figsize=(10, 6))
for panel, feature in enumerate(acidity_related, start=1):
    # The five panels occupy the first cells of a 2 x 3 grid.
    plt.subplot(2, 3, panel)
    sns.regplot(data=df, x='pH', y=feature,
                color=color[6], scatter_kws={'s': 10})
plt.tight_layout()
print("Figure 11: pH vs acid")
Figure 11: pH vs acid
與品質相關性最高的三個特征是酒精濃度,揮發性酸度,和檸檬酸。下面圖中顯示的酒精濃度,揮發性酸和品質的關系。
酒精濃度,揮發性酸和品質:
對於好酒(7,8)以及差酒(3,4),關系很明顯。但是對於中等酒(5,6),酒精濃度和揮發性酸度有很大程度的交叉。
根據下圖可以得到 質量較好的酒含的酒精量較高, 質量不好的酒的揮發性酸較高。
plt.style.use('ggplot')  # switch back to the 'ggplot' style sheet
# lmplot draws scatter plots with optional regression fits.
#   hue / col / row: variables that split the data into subsets
#   col_wrap: number of facets per row
#   order: polynomial order of the fit
#   markers: marker symbols for the scatter points
# Same plot twice, all quality levels overlaid in one panel: first with the
# regression lines, then with the points only.
overlaid_variants = (
    (True, "Figure 12-1: Scatter Plots of Alcohol, Volatile Acid and Quality"),
    (False, "Figure 12-2: Scatter Plots of Alcohol, Volatile Acid and Quality"),
)
for with_fit, caption in overlaid_variants:
    sns.lmplot(data=df, x='alcohol', y='volatile acidity', hue='quality',
               fit_reg=with_fit, height=5, scatter_kws={'s': 10})
    print(caption)
    plt.show()
Figure 12-1: Scatter Plots of Alcohol, Volatile Acid and Quality
Figure 12-2: Scatter Plots of Alcohol, Volatile Acid and Quality
# Faceted variant: `col='quality'` gives every quality level its own panel
# and `col_wrap=3` limits each row to three panels.
#   hue / col / row: variables that split the data into subsets
#   order: polynomial order of the fit
#   markers: marker symbols for the scatter points
facet_axes = dict(x='alcohol', y='volatile acidity', col='quality', hue='quality')

# Points only.
sns.lmplot(data=df, fit_reg=False, height=3, aspect=0.8, col_wrap=3,
           scatter_kws={'s': 20}, **facet_axes)
print("Figure 12-3: Scatter Plots of Alcohol, Volatile Acid and Quality")
print()
plt.show()

# Points plus a regression line per panel.
sns.lmplot(data=df, fit_reg=True, height=3, aspect=0.9, col_wrap=3,
           scatter_kws={'s': 20}, **facet_axes)
print("Figure 12-4: Scatter Plots of Alcohol, Volatile Acid and Quality")
Figure 12-3: Scatter Plots of Alcohol, Volatile Acid and Quality
Figure 12-4: Scatter Plots of Alcohol, Volatile Acid and Quality
pH和非揮發性的酸以及檸檬酸有相關性。整體趨勢也很合理,濃度越高,pH越低,酒越酸。
# Style
sns.set_style('ticks')
sns.set_context("notebook", font_scale=1.4)
plt.figure(figsize=(6, 5))
# Colour map used to encode pH. Any registered matplotlib colour-map name is
# accepted (e.g. Blues, RdBu, Spectral, YlOrRd, ...); appending "_r" to a name
# selects the reversed map.
# plt.get_cmap is used because plt.cm.get_cmap has been removed from recent
# matplotlib releases.
cm = plt.get_cmap('RdBu')
# Fixed acidity against citric acid, with each point coloured by its pH on a
# fixed 2.6-4 colour scale.
sc = plt.scatter(df['fixed acidity'], df['citric acid'], c=df['pH'], vmin=2.6, vmax=4, s=15, cmap=cm)
bar = plt.colorbar(sc)
bar.set_label('pH', rotation=0)
plt.xlabel('fixed acidity')
plt.ylabel('citric acid')
plt.xlim(4, 18)
plt.ylim(0, 1)
print('Figure 12-1: pH with Fixed Acidity and Citric Acid')
Figure 12: pH with Fixed Acidity and Citric Acid
# Same scatter as the previous figure, drawn with the 'YlOrRd' colour map.
# A fresh default-sized figure is opened first so that, when run outside a
# notebook, the points and colour bar are not drawn onto the previous axes.
plt.figure()
# plt.get_cmap is used because plt.cm.get_cmap has been removed from recent
# matplotlib releases.
cm = plt.get_cmap('YlOrRd')
sc = plt.scatter(x=df['fixed acidity'], y=df['citric acid'], c=df['pH'], vmin=2.6, vmax=4, s=15, cmap=cm)
bar = plt.colorbar(sc)
bar.set_label('pH', rotation=0)
plt.xlabel('fixed acidity')
plt.ylabel('citric acid')
plt.xlim(4, 18)
plt.ylim(0, 1)
print('Figure 12-2: pH with Fixed Acidity and Citric Acid')
Figure 12-2: pH with Fixed Acidity and Citric Acid
參考:阿裡天池