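# Reference: 《深入淺出Pandas:利用Python進行數據處理與分析》 (Pandas in Depth: Data Processing and Analysis with Python)
# A pandas data structure resembles an Excel sheet: it has rows and columns, each with its own labels. Those row and column labels are the indexes, and data can be located quickly through them.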
import numpy as np
import pandas as pd
import datetime
# Sample workbook used by the examples in these notes.
data = 'https://www.gairuo.com/file/data/dataset/team.xlsx'

# Option 1: make the 'name' column the index while reading.
df = pd.read_excel(data, index_col='name')

# Option 2: read the sheet first, then promote 'name' to the index.
df = pd.read_excel(data)
df = df.set_index('name')
# Multi-level index
# NOTE(review): these snippets assume df still has 'name' and 'team' as
# ordinary columns -- confirm against the frame actually loaded.
# Several keys must be passed as ONE list. A second positional argument is
# not a key: older pandas binds it to `drop`, pandas 2.x rejects it.
df.set_index(['name', 'team'])  # 'name' and 'team' become the index levels
# A level may also be an array: first letter of each name, then the name
df.set_index([df.name.str[0], 'name'])
# Using a Series as the index
# (the section title is now a comment; as a bare line it was evaluated as an
# undefined name)
s = pd.Series([1, 2, 3, 4])
df.set_index(s)  # use the Series values as the index
df.set_index([s, 'name'])  # combine a Series with an existing column
df.set_index([s, s**2])  # a computed Series as the second level
# Other set_index parameters
df.set_index('month', drop=False)  # keep the original column as well
df.set_index('month', append=True)  # keep the existing index as an outer level
df.set_index('month', inplace=True)  # modify df itself instead of returning a copy

# Resetting the index
df.reset_index()  # move the index back into an ordinary column
df.set_index('month').reset_index()  # set, then reset: 'month' is a column again
# Drop the index instead of restoring it: the month column is gone
df.set_index('month').reset_index(drop=True)
# `df2` was never defined in these notes; use `df` like the neighbouring lines
df.reset_index(inplace=True)  # modify df itself
# Only the 'year' level (level=1) is moved back to a column
df.set_index(['month', 'year']).reset_index(level=1)
df.reset_index(level='class')  # same idea, selecting the level by name
# With multi-level columns, col_level picks the column level that receives
# the restored label
df.reset_index(level='class', col_level=1)
# col_fill names the remaining column levels for the restored label
df.reset_index(level='class', col_level=1, col_fill='species')
# Index types
# RangeIndex: immutable index representing a monotonic integer range.
# Int64Index: int64 labels; an immutable, ordered, sliceable sequence.
# UInt64Index: unsigned integer labels.
# Float64Index: float64 labels.
# CategoricalIndex: categorical labels.
# The Int64Index / UInt64Index / Float64Index classes were removed in
# pandas 2.0; pd.Index with an explicit dtype builds the same indexes and
# also works on older versions (which print the old class names).
pd.RangeIndex(1, 100, 2)
# RangeIndex(start=1, stop=100, step=2)
pd.Index([1, 2, 3])
# Index([1, 2, 3], dtype='int64')
pd.Index([1, 2, 3, -4], dtype='int64', name='num')
# Index([1, 2, 3, -4], dtype='int64', name='num')
pd.Index([1, 2, 3, 4], dtype='uint64')
# Index([1, 2, 3, 4], dtype='uint64')
pd.Index([1.2, 2.3, 3, 4], dtype='float64')
# Index([1.2, 2.3, 3.0, 4.0], dtype='float64')
pd.CategoricalIndex(['a', 'b', 'a', 'b'])  # categorical index
# CategoricalIndex(['a', 'b', 'a', 'b'], categories=['a', 'b'], ordered=False, dtype='category')
# Interval index
# closed='left' yields [0, 1), [1, 2), ...; the default closed='right'
# would yield (0, 1], (1, 2], ... instead.
pd.interval_range(start=0, end=5, closed='left')
# IntervalIndex([[0, 1), [1, 2), [2, 3), [3, 4), [4, 5)], dtype='interval[int64, left]')

# Multi-level index built from one array per level
arrays = [
    [1, 1, 2, 2],
    ['red', 'yellow', 'red', 'blue'],
]
pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
# MultiIndex([(1, 'red'), (1, 'yellow'), (2, 'red'), (2, 'blue')], names=['number', 'color'])
# Date index
# Consecutive days from one date to another
pd.date_range(start='1/1/2018', end='1/08/2018')
# A start date plus a number of periods
pd.date_range(start='1/1/2018', periods=8)

# Period index
# Monthly periods
pd.period_range(start='2017-01-01', end='2018-01-01', freq='M')
# Quarterly endpoints expanded at monthly frequency
pd.period_range(
    start=pd.Period('2017Q1', freq='Q'),
    end=pd.Period('2017Q2', freq='Q'),
    freq='M',
)
# PeriodIndex(['2017-03', '2017-04', '2017-05', '2017-06'], dtype='period[M]')
# One-second periods. The lower-case alias 's' is used because the
# upper-case 'S' is deprecated in recent pandas releases.
t = pd.period_range('2020-5-1 10:00:05', periods=8, freq='s')
pd.PeriodIndex(t, freq='s')

# Timedelta index
pd.TimedeltaIndex(
    data=[
        '06:05:01.000030',
        '+23:59:59.999999',
        '22 day 2 min 3us 10ns',
        '+23:29:59.999999',
        '+12:19:59.999999',
    ]
)
# Strings may be mixed with numpy and datetime timedelta objects
pd.TimedeltaIndex(
    [
        '1 days',
        '1 days, 00:00:05',
        np.timedelta64(2, 'D'),
        datetime.timedelta(days=2, seconds=2),
    ]
)
# Creating an index
pd.Index([1, 2, 3])  # Index([1, 2, 3], dtype='int64')
pd.Index(list('abc'))  # Index(['a', 'b', 'c'], dtype='object')
pd.Index(['e', 'd', 'a', 'b'], name='something')  # name= sets the index name
df.index  # RangeIndex(start=0, stop=4, step=1)
df.columns  # Index(['month', 'year', 'sale'], dtype='object')
# The attributes below also work on df.columns, since both are Index objects:
df.index.name  # name
df.index.array  # values as a pandas array
df.index.dtype  # data type
df.index.shape  # shape
df.index.size  # number of elements
df.index.values  # values as a numpy array
# Others, less commonly used
df.index.empty  # whether the index is empty
df.index.is_unique  # whether all values are distinct
df.index.names  # list of level names
# Index.is_all_dates was removed in pandas 2.0; its deprecation message
# points to inferred_type instead.
df.index.inferred_type in ('datetime', 'datetime64')  # are all labels datetimes
df.index.has_duplicates  # whether any value is repeated
df.index.values  # the index values as an array
# The methods below also work on df.columns
df.index.astype('int64')  # convert the dtype (was misspelled as `f.index`)
df.index.isin(['a', 'b'])  # membership test; the `values` argument is required
df.index.rename('number')  # change the index name
df.index.nunique()  # number of distinct values
df.index.sort_values(ascending=False)  # sort, descending
df.index.map(lambda x: x + '_')  # apply a function to every label
df.index.str.replace('_', '')  # string replace
df.index.str.split('_')  # string split
df.index.to_list()  # convert to a list
df.index.to_frame(index=False, name='a')  # convert to a DataFrame
df.index.to_series()  # convert to a Series
df.index.to_numpy()  # convert to a numpy array
df.index.unique()  # distinct values
df.index.value_counts()  # count of each distinct value
df.index.where(df.index == 'a')  # keep labels where the condition holds, NaN elsewhere
df.index.rename('grade', inplace=False)  # rename the index
df.index.rename(['species', 'year'])  # rename the levels of a multi-level index
df.index.max()  # largest label
df.index.argmax()  # position of the largest label
df.index.any()  # whether any element is truthy
df.index.all()  # whether all elements are truthy
df.index.T  # transpose; for an Index this is the index itself
# Others, less commonly used
df.index.append(pd.Index([4, 5]))  # append another index
df.index.repeat(2)  # repeat every label twice
df.index.inferred_type  # inferred kind of the labels
df.index.hasnans  # whether any label is missing
df.index.is_monotonic_decreasing  # whether labels are monotonically decreasing
# Index.is_monotonic (an alias of is_monotonic_increasing) was removed in
# pandas 2.0, so only the explicit name is used.
df.index.is_monotonic_increasing  # whether labels are monotonically increasing
df.index.nbytes  # number of bytes in the underlying data
df.index.ndim  # number of dimensions
df.index.nlevels  # number of index levels, usually 1
df.index.min()  # smallest label
df.index.argmin()  # position of the smallest label
df.index.argsort()  # integer positions that would sort the index
df.index.asof(2)  # the label itself if present, otherwise the previous one
# Accepts a numpy dtype or a pandas type
df.index.astype('int64', copy=True)  # converted copy
# Copy. The `dtype` argument of Index.copy was removed in pandas 2.0, so the
# conversion is chained with astype instead.
df.index.copy(name='new', deep=True).astype('int64')
df.index.delete(1)  # remove the label at position 1
# Labels present here but not in the other index
df.index.difference(pd.Index([1, 2, 4]), sort=False)
df.index.drop('a', errors='ignore')  # remove a label, ignoring it if absent
df.index.drop_duplicates(keep='first')  # remove duplicate labels
df.index.droplevel(0)  # remove a level
df.index.dropna(how='all')  # remove missing labels
df.index.duplicated(keep='first')  # boolean array, True for repeated labels
df.index.equals(df.index)  # whether two index objects hold the same elements
df.index.factorize()  # encode as (codes array 0..n, Index of uniques)
# Fill missing labels. The former second argument {0: 'nan'} was bound to
# `downcast`, is not a valid item->dtype mapping, and `downcast` itself is
# deprecated, so it is dropped.
df.index.fillna(0)
# List of formatted labels: the name is emitted first and the formatter
# adds 10 to every value.
# NOTE(review): Index.format is deprecated in recent pandas; consider
# df.index.map(formatter) -- confirm against the installed version.
df.index.format(name=True, formatter=lambda x: x + 10)
# Array of positions for the given labels; -1 for labels that are absent
df.index.get_indexer([2, 9])
# Index object holding the labels of the given level
df.index.get_level_values(0)
# Position of a label
df.index.get_loc('b')
df.index.insert(2, 'f')  # insert 'f' at position 2
df.index.intersection(df.index)  # intersection
df.index.is_(df.index)  # identity-style check, similar to `is`
# Index.is_categorical() is deprecated; the documented replacement checks
# the dtype directly.
isinstance(df.index.dtype, pd.CategoricalDtype)  # whether the index is categorical
# Index.is_type_compatible() was removed in pandas 2.0.
# NOTE(review): it appears to have compared a kind string with
# inferred_type -- confirm before relying on this replacement.
df.index.inferred_type == 'integer'  # whether the labels are of the given kind
df.index.isna()  # boolean array, True where the label is missing
df.index.isnull()  # alias of isna
df.index.join(df.index, how='left')  # join with another index
df.index.notna()  # boolean array, True where the label is not missing
df.index.notnull()  # alias of notna
df.index.ravel()  # flattened view of the values
df.index.reindex(['a', 'b'])  # (new Index, indexer array)
df.index.searchsorted('f')  # position at which 'f' would be inserted to keep order
df.index.searchsorted([0, 4])  # array([0, 3]); several values at once
df.index.set_names('quarter')  # set the index name
df.index.set_names('species', level=0)  # set the name of one level
df.index.set_names(['kind', 'year'], inplace=True)  # set all level names in place
df.index.shift(10, freq='D')  # shift a datetime index forward by 10 days
# Set operations between two indexes
idx1.symmetric_difference(idx2)  # labels found in exactly one of the two
idx1.union(idx2)  # labels found in either one

# DataFrame helpers that touch the labels
df.add_prefix('t_')  # prepend a prefix to every column label
df.add_suffix('_d')  # append a suffix to every column label
df.first_valid_index()  # first index label that has a value
df.last_valid_index()  # last index label that has a value

# New index with masked positions replaced: labels below 10 become 0
df.index.putmask(df.index < 10, 0)
# rename_axis changes the NAME of an axis, not its labels
s.rename_axis("animal")  # name the index of a Series
df.rename_axis(["dow", "hr"])  # name the levels of a multi-level row index
df.rename_axis('info', axis="columns")  # name the column index
# Rename row index level names through a mapping
df.rename_axis(index={'a': 'A', 'b': 'B'})
# Rename column index level names through a mapping
df.rename_axis(columns={'name': 's_name', 'b': 'B'})
# rename (unlike rename_axis) changes the column LABELS, here in place
df.rename(columns={'name': 's_name', 'b': 'B'}, inplace=True)
df.rename_axis(columns=str.upper)  # upper-case the column index name(s)
# Rename column labels one by one through a mapping
df.rename(columns={"A": "a", "B": "c"})
df.rename(str.lower, axis='columns')
# Rename row labels
df.rename(index={0: "x", 1: "y", 2: "z"})
df.rename({1: 2, 2: 4}, axis='index')
# Convert the row labels to another type
df.rename(index=str)
# Build the old -> new mapping from two parallel lists, then rename
replacements = dict(zip(list1, list2))
df.rename(replacements)
# Prefix every column label
df.rename(lambda x: 't_' + x, axis=1)
# Hand out new labels in order by drawing from an iterator with next()
df.rename(lambda x, y=iter('abcdef'): next(y), axis=1)
# Pair the current column labels with new ones to form the mapping
df.rename(columns=dict(zip(df, list('abcd'))))
# Replace the row labels
df.set_axis(['a', 'b', 'c'], axis='index')
# Replace the column labels
df.set_axis(list('abcd'), axis=1)
# Keep the change: set_axis lost its `inplace` argument in pandas 2.0, so
# assign the returned frame back instead.
df = df.set_axis(['a', 'b'], axis='columns')
# An Index object may be passed as the new labels
df.set_axis(pd.Index(list('abcde')), axis=0)
# Index.isin(): test each label for membership in a collection
idx = pd.Index([1, 2, 3])
df.index.isin(idx)
# array([False, False, False, False])
df.index.isin(['a', 'b'])
# array([ True,  True, False, False])
# isin on a MultiIndex compares whole tuples
midx = pd.MultiIndex.from_arrays(
    [[1, 2, 3], ['red', 'blue', 'green']],
    names=('number', 'color'),
)
midx.isin([(1, 'red'), (3, 'red')])
# array([ True, False, False])

# isin on a DatetimeIndex accepts date strings
dates = ['2000-03-11', '2000-03-12', '2000-03-13']
dti = pd.to_datetime(dates)
dti.isin(['2000-03-11'])
# array([ True, False, False])
# Index.argsort(): integer positions that would sort the index
idx = pd.Index(['b', 'a', 'd', 'c'])
order = idx.argsort()  # array([1, 0, 3, 2])
idx[order]  # Index(['a', 'b', 'c', 'd'], dtype='object')

# Index.asof(label): return the label itself when present, otherwise the
# previous label; also works with dates to find the latest earlier date.
idx2 = pd.Index([1, 3, 6])
idx2.asof(5)  # 3
idx2.asof(6)  # 6
idx2.asof(-1)  # nan, nothing precedes -1
# Index.get_loc(label): where a label sits; the result type depends on the index
# Unique labels -> a single integer position
unique_index = pd.Index(list('abc'))
unique_index.get_loc('b')  # 1

# Repeated labels in sorted order -> a slice
monotonic_index = pd.Index(list('abbc'))
monotonic_index.get_loc('b')  # slice(1, 3, None)

# Repeated labels out of order -> a boolean mask
non_monotonic_index = pd.Index(list('abcb'))
non_monotonic_index.get_loc('b')
# array([False,  True, False,  True], dtype=bool)