您现在的位置：程式師世界 >> 編程語言 > >> 更多編程語言 >> Python

Pandas data analysis 03 - pandas index settings and operations (full usage analysis)

編輯：Python

參考書目：《深入淺出Pandas：利用Python進行數據處理與分析》

pandas data structures resemble Excel sheets: they have rows and columns, each with their own names. These row and column labels are the indexes, and data can be located quickly through them.

建立索引

import numpy as np
import pandas as pd
import datetime
data = 'https://www.gairuo.com/file/data/dataset/team.xlsx'
df = pd.read_excel(data, index_col='name') # When reading, set the index to name
# Or read the file first, then set the index
df=pd.read_excel(data)
df.set_index('name',inplace=True)

多重索引

#多重索引
df.set_index(['name', 'team']) # Set name and team together as a two-level index
df.set_index([df.name.str[0],'name']) # The first letter of the name and the name are set as the index

序列索引

#序列索引
s = pd.Series([1, 2, 3, 4])
df.set_index(s) # 指定一個索引
df.set_index([s, 'name']) # Combine the given Series with an existing column as the index
df.set_index([s, s**2]) # 計算索引

其他的參數

# 其他的參數
df.set_index('month', drop=False) # 保留原列
df.set_index('month', append=True) # 保留原來的索引
df.set_index('month', inplace=True) # Set the index and modify df in place

重置索引

#重置索引
df.reset_index() # Reset the index; the old index becomes a regular column
df.set_index('month').reset_index() # 相當於啥也沒干
# 刪除原索引,month column gone
df.set_index('month').reset_index(drop=True)
df2.reset_index(inplace=True) # Apply the change in place
# Move only the year level (level 1) of the index back to a column
df.set_index(['month', 'year']).reset_index(level=1)
df2.reset_index(level='class') # Same as above, but selecting the level by name
df.reset_index(level='class', col_level=1) # 列索引
# col_fill: when the columns have multiple levels, sets how the other levels are named (here 'species')
df.reset_index(level='class', col_level=1, col_fill='species')

索引類型

索引類型
# RangeIndex: Immutable index implementing a monotonic integer range.
# Int64Index: int64 type; an immutable ndarray implementing an ordered, sliceable set.
# UInt64Index: Unsigned integer labels
# Float64Index: Float64 類型
# CategoricalIndex 類別索引

pd.RangeIndex(1,100,2)
# RangeIndex(start=1, stop=100, step=2)
pd.Index([1, 2, 3])
# Int64Index([1, 2, 3], dtype='int64')
pd.Int64Index([1,2,3,-4], name='num') # v2.0 將棄用
# Int64Index([1, 2, 3, -4], dtype='int64', name='num')
pd.UInt64Index([1,2,3,4]) # v2.0 將棄用
# UInt64Index([1, 2, 3, 4], dtype='uint64')
pd.Float64Index([1.2,2.3,3,4]) # v2.0 將棄用
# Float64Index([1.2, 2.3, 3.0, 4.0], dtype='float64')
pd.CategoricalIndex(['a', 'b', 'a', 'b']) #類別索引
# CategoricalIndex(['a', 'b', 'a', 'b'], categories=['a', 'b'], ordered=False, dtype='category')

#間隔索引

#間隔索引
pd.interval_range(start=0, end=5,closed='left') # IntervalIndex([[0, 1), [1, 2), [2, 3), [3, 4), [4, 5)], dtype='interval[int64, left]'); the default closed='right' would give (0, 1], (1, 2], ...

#多層索引

#多層索引
arrays = [[1, 1, 2, 2], ['red', 'yellow', 'red', 'blue']]
pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) #MultiIndex([(1, 'red'),(1, 'yellow'),(2, 'red'),(2, 'blue')],names=['number', 'color'])

日期

# Consecutive from one date to another
pd.date_range(start='1/1/2018', end='1/08/2018')
# Specify the start time and period
pd.date_range(start='1/1/2018', periods=8)
#周期索引
# 以月為周期
pd.period_range(start='2017-01-01', end='2018-01-01', freq='M')
# Nested periods: Period objects given as start/end act as anchor endpoints for the monthly range
pd.period_range(start=pd.Period('2017Q1', freq='Q'),end=pd.Period('2017Q2', freq='Q'), freq='M')
#PeriodIndex(['2017-03', '2017-04', '2017-05', '2017-06'], dtype='period[M]', freq='M')
#秒為周期
t = pd.period_range('2020-5-1 10:00:05', periods=8, freq='S')
pd.PeriodIndex(t,freq='S')

#Time difference index
pd.TimedeltaIndex(data =['06:05:01.000030', '+23:59:59.999999',
'22 day 2 min 3us 10ns', '+23:29:59.999999','+12:19:59.999999'])
# 使用 datetime
pd.TimedeltaIndex(['1 days', '1 days, 00:00:05',
np.timedelta64(2, 'D'), datetime.timedelta(days=2, seconds=2)])

索引對象

#創建索引
pd.Index([1, 2, 3]) # Int64Index([1, 2, 3], dtype='int64')
pd.Index(list('abc')) # Index(['a', 'b', 'c'], dtype='object')
pd.Index(['e', 'd', 'a', 'b'], name='something') # 指定索引名稱 name

df.index # RangeIndex(start=0, stop=4, step=1)
df.columns # Index(['month', 'year', 'sale'], dtype='object')

索引屬性

# The following attributes also work on df.columns, because both are Index objects:
df.index.name # 名稱
df.index.array # array 數組
df.index.dtype # 數據類型
df.index.shape # 形狀
df.index.size # 元素數量
df.index.values # array 數組
# 其他,不常用
df.index.empty # 是否為空
df.index.is_unique # Whether all values are unique
df.index.names # 名稱列表
df.index.is_all_dates # Is it all datetime
df.index.has_duplicates # 是否有重復值
df.index.values # 索引的值 array

索引操作

# The following methods also work on df.columns
df.index.astype('int64') # 轉換類型
df.index.isin() # 是否存在,見下方示例
df.index.rename('number') # 修改索引名稱
df.index.nunique() # 不重復值的數量
df.index.sort_values(ascending=False,) # 排序,倒序
df.index.map(lambda x:x+'_') # map 函數處理
df.index.str.replace('_', '') # str 替換
df.index.str.split('_') # 分隔
df.index.to_list() # 轉為列表
df.index.to_frame(index=False, name='a') # 轉成 DataFrame
df.index.to_series() # 轉 series
df.index.to_numpy() # 轉為 numpy
df.index.unique() # 去重
df.index.value_counts() # Unique values and how many times each occurs
df.index.where(df.index=='a') # 篩選
df.index.rename('grade', inplace=False) # Rename the index name
df.index.rename(['species', 'year']) # 多層,Rename the index name
df.index.max() # 最大值
df.index.argmax() # 最大索引值
df.index.any() #The function checks if any element in the index is true
df.index.all()
df.index.T # 轉置,Useful in multi-level indexes

# 其他,不常用
df.index.append(pd.Index([4,5])) # 追加
df.index.repeat(2) # 重復幾次
df.index.inferred_type # Inferred data type
df.index.hasnans # 有沒有空值
df.index.is_monotonic_decreasing # 是否單調遞減
df.index.is_monotonic # 是否單調遞增
df.index.is_monotonic_increasing # 是否單調遞增
df.index.nbytes # The number of bytes in the underlying data
df.index.ndim # 維度數,維數
df.index.nlevels # The number of index levels,通常為 1
df.index.min() # 最小值
df.index.argmin() # 最小索引值
df.index.argsort() # Array of integer positions that would sort the index
df.index.asof(2) # Return the label if present, otherwise the previous label
# numpy dtype or pandas type
df.index.astype('int64', copy=True) # 深拷貝
# 拷貝
df.index.copy(name='new', deep=True, dtype='int64')
df.index.delete(1) # 刪除指定位置
# 對比不同
df.index.difference(pd.Index([1,2,4]), sort=False)
df.index.drop('a', errors='ignore') # 刪除
df.index.drop_duplicates(keep='first') # 去重值
df.index.droplevel(0) # Delete hierarchy
df.index.dropna(how='all') # 刪除空值
df.index.duplicated(keep='first') # The duplicate values are in the result array as True
df.index.equals(df.index) # Is it the same as another index object
df.index.factorize() # 分解成（array:0-n, Index）
df.index.fillna(0, {0:'nan'}) # 填充空值
# List of strings: name=True puts the index name first, and the formatter adds 10 to each value
df.index.format(name=True, formatter=lambda x:x+10)
# Returns an array of integer positions for the given values; values not found are marked -1
df.index.get_indexer([2,9])
# 獲取 指定層級 Index 對象
df.index.get_level_values(0)
# 指定索引的位置,見示例
df.index.get_loc('b')
df.index.insert(2, 'f') # Insert 'f' at position 2
df.index.intersection(df.index) # 交集
df.index.is_(df.index) # 類似 is 檢查
df.index.is_categorical() # Whether the index holds categorical data
df.index.is_type_compatible(df.index) # 類型是否兼容
df.index.is_type_compatible(1) # 類型是否兼容
df.index.isna() # array 是否為空
df.index.isnull() # array whether there are missing values
df.index.join(df.index, how='left') # 連接
df.index.notna() # Boolean array: True where the value is not missing
df.index.notnull() # Alias of notna(): True where the value is not missing
df.index.ravel() # Flattened ndarray of the underlying values
df.index.reindex(['a','b']) # 新索引 (Index,array:0-n)
df.index.searchsorted('f') # Position at which this value would be inserted to keep the index sorted
df.index.searchsorted([0, 4]) # array([0, 3]) 多個
df.index.set_names('quarter') # 設置索引名稱
df.index.set_names('species', level=0)
df.index.set_names(['kind', 'year'], inplace=True)
df.index.shift(10, freq='D') # Shift the datetime index forward by 10 days
idx1.symmetric_difference(idx2) # Elements that appear in exactly one of the two indexes
idx1.union(idx2) # 拼接
df.add_prefix('t_') # Prefix the header
df.add_suffix('_d') # Add a suffix to the header
df.first_valid_index() # The first index with a value
df.last_valid_index() # The last index with a value
# Returns a new Index with the masked values replaced: values less than 10 become 0
df.index.putmask(df.index<10, 0)

索引重命名

s.rename_axis("animal") # 索引重命名
df.rename_axis(["dow", "hr"]) # Rename the levels of a multi-level index
df.rename_axis('info', axis="columns") # Rename the column index name
# Modify the multi-level row index name
df.rename_axis(index={'a': 'A', 'b': 'B'})
# Modify the multi-level column index name
df.rename_axis(columns={'name': 's_name', 'b': 'B'})
df.rename(columns={'name': 's_name', 'b': 'B'},inplace=True) #修改列名
df.rename_axis(columns=str.upper) # Upper-case the column index name(s)

修改索引

# One-to-one correspondence to modify the column index
df.rename(columns={"A": "a", "B": "c"})
df.rename(str.lower, axis='columns')
# 修改行索引
df.rename(index={0: "x", 1: "y", 2: "z"})
df.rename({1: 2, 2: 4}, axis='index')
# 修改數據類型
df.rename(index=str)
# Rename the index using a mapping built from two lists
replacements = {l1:l2 for l1, l2 in zip(list1, list2)}
df.rename(replacements)
# Prefix the column name
df.rename(lambda x:'t_' + x, axis=1)
# 利用 iter() 函數的 next 特性修改
df.rename(lambda x, y=iter('abcdef'): next(y), axis=1)
# Rename columns: zip the old and new names together into a dictionary
df.rename(columns=dict(zip(df, list('abcd'))))

使用set_axis修改

# 修改索引
df.set_axis(['a', 'b', 'c'], axis='index')
# 修改列名
df.set_axis(list('abcd'), axis=1)
# 使修改生效
df.set_axis(['a', 'b'], axis='columns', inplace=True)
# Pass in the index content
df.set_axis(pd.Index(list('abcde')), axis=0)

案例

# idx.isin() 是否存在
idx = pd.Index([1,2,3])
df.index.isin(idx)
# array([False, False, False, False])
df.index.isin(['a','b'])
# array([ True, True, False, False])
midx = pd.MultiIndex.from_arrays([[1,2,3],['red', 'blue', 'green']],names=('number', 'color'))
midx.isin([(1, 'red'), (3, 'red')])
# array([ True, False, False])
dates = ['2000-03-11', '2000-03-12', '2000-03-13']
dti = pd.to_datetime(dates)
dti.isin(['2000-03-11'])
# array([ True, False, False])
# i.argsort() 排序
# Integer positions that would sort the index; see the example below
idx = pd.Index(['b', 'a', 'd', 'c'])
order = idx.argsort() # array([1, 0, 3, 2])
idx[order] # Index(['a', 'b', 'c', 'd'], dtype='object')
# i.asof(2): return the closest preceding label; also works with dates, e.g. to find the most recent date
# Returns the label from the index if it is present; otherwise returns the previous label
idx2 = pd.Index([1,3,6])
idx2.asof(5) # 3
idx2.asof(6) # 6
idx2.asof(-1) # nan
# index.get_loc 指定索引的位置,見示例
unique_index = pd.Index(list('abc'))
unique_index.get_loc('b') # 1
monotonic_index = pd.Index(list('abbc'))
monotonic_index.get_loc('b') # slice(1, 3, None)
non_monotonic_index = pd.Index(list('abcb'))
non_monotonic_index.get_loc('b')
# array([False, True, False, True], dtype=bool)