Self-taught programming series, part 5: getting started with pandas
編輯:Python
pandas Learning from
5.1 Series data
5.2 Creating a DataFrame
Summary: ways to create a DataFrame
5.3 Index objects
5.4 Basic pandas functionality
5.5 Descriptive statistics
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import pandas_datareader.data as web
5.1 Series data
Indexing
Arithmetic: automatic alignment on the index
Missing values and the name attribute
# A Series pairs a one-dimensional array of values with an index of labels
# (for time-series data the index holds the timestamps).
series1 = pd.Series([1, 3, 5, 7])
print(series1)

# Supply custom labels instead of the default integer index.
series2 = pd.Series(data=[1, 3, 4, 6], index=['a', 'b', 'c', 'd'])
series2
series2.index

# Look values up by label: one label gives a scalar, a list of labels gives
# a Series in the requested order.
value1 = series2['c']
value2 = series2[['a', 'd', 'b']]
value2

# NumPy-style operations (boolean filtering, scalar arithmetic, ufuncs) keep
# the index attached to the values.
greater_than_five = series2 > 5
series3 = series2[greater_than_five]
print(series3)
series4 = series2 * 2
print(series4)
series5 = np.exp(series2)
print(series5)

# A Series also behaves like an ordered dict from labels to values, so a
# membership test looks at the index and a dict can be used to build one.
print('a' in series2)
sdata = {
    'zhao': 1000,
    'qian': 2000,
    'sun': 3000,
    'zhou': 4000,
}
series6 = pd.Series(sdata)
print(series6)

# An explicit index picks and orders the dict entries: a label that is not a
# dict key ('li') gets NaN, and a dict key that is not listed ('zhou') is
# left out of the result.
keys = ['sun', 'zhao', 'qian', 'li']
series7 = pd.Series(sdata, index=keys)
series7

# Detecting missing data: module-level functions ...
pd.isnull(series7)
pd.notnull(series7)
# ... and the equivalent Series methods.
series7.isnull()
series7.notnull()

# Arithmetic aligns the operands on their index labels first; a label that is
# missing (or NaN) on either side yields NaN in the result.
result = series6 + series7
print(result)

# Both the Series and its index carry a `name` attribute.
series7.name = 'salary'
series7.index.name = 'name'
series7
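5.2 Creating a DataFrame
A DataFrame is an ordered collection of columns with both a row index and a column index; it can be thought of as a dictionary of Series that share the same index.
Internally the data is usually stored as one or more two-dimensional blocks.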
# The usual way to build a DataFrame is from a dict of equal-length lists or
# arrays; each key becomes a column.
data = {
    'state': ['ohio', 'ohio', 'Nevada', 'Neveda', 'Nevada', 'wang'],
    'year': [2000, 2001, 2002, 2003, 2003, 2004],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.0],
}
frame = pd.DataFrame(data)
frame

# As with a Series, `columns` fixes the column order, and a requested column
# that is not in the data ('debt') is created full of missing values.
column_order = ['year', 'state', 'pop', 'debt']
row_labels = ['one', 'two', 'three', 'four', 'five', 'six']
frame2 = pd.DataFrame(data, columns=column_order, index=row_labels)
print(frame2.columns)
print(frame2)

# A column can be fetched as a Series by attribute or by key.
print(frame2.year)  # attribute form needs a valid Python identifier
print(frame2['pop'])  # key form works for any column name

# Assigning to a column overwrites it: a scalar is broadcast to every row ...
frame2['debt'] = 14.1
print(frame2)
# ... while an array must match the number of rows (np.arange accepts floats,
# the built-in range only integers).
frame2['debt'] = np.arange(6.0)
print(frame2)

# NOTE(review): the original note said a selected column is a view on the
# frame's data rather than a copy; that depends on the pandas version and
# copy-on-write settings, so confirm before relying on it.

# Assigning to a new key creates a column (here a boolean one); `del` removes
# a column again.
frame2['eastern'] = (frame2.state == 'ohio')
print(frame2)
del frame2['eastern']
print(frame2)
# A nested dict is another accepted input: the outer keys become the columns
# and the inner keys are merged into the row index.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = pd.DataFrame(pop)
print(frame3)

# An explicit index selects the rows to keep; a label that no inner dict
# contains (2003) produces a row of NaN.
df = pd.DataFrame(pop, index=[2001, 2002, 2003])
print(df)

# Transpose swaps rows and columns.
print(frame3.T)

# The dict values may also be Series.
pdata = {
    'ohio': frame3['ohio'][:-1],
    'Nevada': frame3['Nevada'][:2],
}
df = pd.DataFrame(pdata)
print(df)

# A list of dicts is the opposite orientation of a nested dict: every dict is
# one row, its keys are the column labels, and since no row labels are given
# the default integer index is used.
sdata = [
    {'name': 'wang', 'age': 12},
    {'name': 'liu', 'age': 22},
]
df = pd.DataFrame(sdata)
print(df)
Summary: ways to create a DataFrame
A two-dimensional array
A dictionary of arrays, lists, tuples or Series: each element becomes a column
A nested dictionary: each inner dictionary becomes a column, and the inner keys are merged into the row index
A list of dictionaries or Series: each item becomes a row of the DataFrame, and the dictionary keys (or Series index labels) become the column labels
A list of lists or tuples
Another DataFrame
# Give each axis a name; the names are shown when the frame is printed.
df.columns.name = 'features'
df.index.name = 'year'
print(df)

# `.values` hands back the underlying data as a NumPy array.
print(df.values)
frame2.values  # mixed column types are stored with one dtype that fits all
5.3 Index objects
pandas uses Index objects to hold the axis labels and other metadata such as the axis names.
Any array or sequence of labels passed in is converted to an Index object.
An Index object is immutable once it has been created.
Index objects support set-like operations similar to Python sets.
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index[1:]

# An Index cannot be changed once created; item assignment such as
#     index[1] = 'd'
# raises "TypeError: Index does not support mutable operations".

# An Index can be built once on its own and then reused by several
# Series/DataFrames.
labels = pd.Index(np.arange(3))
labels
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2.index is labels

frame3
frame3.columns

# An Index supports set-style operations like a Python set, but unlike a set
# it may hold duplicate labels.
labels_index = pd.Index(['foo', 'foo', 'bar', 'bar'])
labels1 = labels_index.append(labels)  # concatenate with another Index
print(labels1)
labels2 = labels1.difference(labels_index)  # set difference
print(labels2)
labels3 = labels1.intersection(labels)  # set intersection
print(labels3)

# Further helpers: `union` (set union), `insert` (add a value at position i)
# and `unique` (the distinct values).
labels3.delete(2)  # new Index without the entry at position 2
labels3.drop(1)  # new Index without the label 1
5.4 Basic pandas functionality
Reindexing: whatever the original index is, pick out the data of interest directly by label
Dropping entries: rows by default; the axis can be changed
Indexing and slicing of Series and DataFrames
Label operators (loc and iloc)
Pitfalls of integer indexing: [-1]
Index alignment and arithmetic operations
Operations between Series and DataFrame: by row, by column, broadcasting
Function application
obj = pd.Series([4.5, 7.2, 5.3, 3.6], index=['d', 'b', 'a', 'c'])
print(obj)

# reindex returns a new object laid out along the requested labels; a label
# that was not present before ('e') comes back as NaN.
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
print(obj2)

# For ordered data such as time series the gaps can be filled while
# reindexing; method='ffill' carries the previous value forward.
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
forward_filled = obj3.reindex(range(6), method='ffill')
print(obj3)  # unchanged: reindex does not modify the original object
print(forward_filled)

# A DataFrame can be reindexed on its rows ...
df = pd.DataFrame(
    np.arange(9).reshape((3, 3)),
    index=['a', 'b', 'c'],
    columns=['ohio', 'california', 'texas'],
)
print(df)
df1 = df.reindex(['a', 'b', 'c', 'd'])
print(df1)

# ... or on its columns via the `columns` keyword.
states = ['ohio', 'utah', 'california']
df2 = df.reindex(columns=states)
print(df2)
# Dropping entries from an axis -- Series first.
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
print(obj)
obj1 = obj.drop('c')  # a single label
print(obj1)
obj2 = obj.drop(['b', 'c'])  # several labels as a list
print(obj2)

# DataFrame
df1 = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['ojio', 'colorado', 'utah', 'newyork'],
    columns=['one', 'two', 'three', 'four'],
)
print(df1)

# By default the labels refer to rows; drop returns a new frame and leaves
# df1 as it was.
df2 = df1.drop(['colorado', 'utah'])
print(df2)
print(df1)

# Naming the axis drops columns instead; axis=1 and axis='columns' are two
# spellings of the same thing.
df3 = df1.drop(['one', 'three'], axis=1)
df4 = df1.drop(['two', 'four'], axis='columns')
print(df3)
print(df4)

# With inplace=True the frame itself is modified and the call returns None,
# so df5 is None and the dropped rows are gone from df1 -- use with care.
df5 = df1.drop(['colorado', 'utah'], inplace=True)
print(df5)
print(df1)
# Indexing a Series.
obj = pd.Series(np.arange(4.0), index=['a', 'b', 'c', 'd'])
print(obj['b'])  # one label -> scalar
print(obj[['b', 'd']])  # list of labels -> Series
# Positions are selected with .iloc. Relying on obj[[2, 3]] to fall back to
# positions on a non-integer index is deprecated (pandas 2.1+).
print(obj.iloc[[2, 3]])
print(obj[obj < 2])  # boolean mask

# Slicing a Series.
print(obj['b':'d'])  # label slice: unlike plain Python, the end IS included
print(obj.iloc[2:3])  # positional slice: usual Python half-open semantics
obj.iloc[2:3] = 5  # assigning through a slice modifies obj in place
print(obj)
# Indexing and slicing a DataFrame.
df1 = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['wang', 'liu', 'zhao', 'qian'],
    columns=['one', 'two', 'three', 'four'],
)
print(df1['one'])  # one column as a Series
print(df1[['one', 'four']])  # several columns as a DataFrame
print(df1[:2])  # a slice selects rows
rows_over_five = df1['three'] > 5
print(df1[rows_over_five])  # boolean Series: keep matching rows (like a spreadsheet filter)

# A boolean DataFrame selects individual cells; assigning through it changes
# df1 itself.
below_five = df1 < 5
df1[below_five] = 0
print(df1)

# Label operator loc and position operator iloc take (rows, columns) in the
# same way a NumPy array does.
df1.loc['wang', ['one', 'three']]  # one row, two columns
df1.loc[['wang', 'zhao'], ['one', 'two']]  # a sub-frame
df1.iloc[2, [3, 0, 1]]  # by integer position
df1.iloc[:, :3]  # positional slices

# Single scalar access.
df1.at['wang', 'one']  # by labels
df1.iat[0, 0]  # by integer positions
# Integer indexes are error prone: with an integer index, [] always means
# "label", never "position".
ser1 = pd.Series(np.arange(3.0))
ser1
ser1[1]  # fine: 1 is a label in the default index 0..2
# ser1[-1] would raise KeyError here, unlike a list or tuple, because -1 is
# not one of the labels.

# With a non-integer index there is no such ambiguity, but positions should
# still be requested explicitly through .iloc (the positional fallback of []
# is deprecated since pandas 2.1).
ser1.index = ['a', 'b', 'c']
ser1
last_value = ser1.iloc[-1]

# So prefer loc (labels) and iloc (positions) to state the intent.
ser1.index = [0, 1, 2]
ser1
ser1.loc[:1]  # label slice, end included: labels 0 and 1
ser1.iloc[:1]  # positional slice, end excluded: position 0 only
# Index alignment: the result carries the union of both indexes, but a label
# present on only one side gives NaN (NaN + number = NaN).
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s1
s2
s1 + s2

# For DataFrames the alignment happens on the rows and on the columns.
df1 = pd.DataFrame(
    np.arange(9.0).reshape((3, 3)),
    columns=list('bcd'),
    index=['ohio', 'texas', 'colorado'],
)
df2 = pd.DataFrame(
    np.arange(12.0).reshape((4, 3)),
    columns=list('bde'),
    index=['utah', 'ohio', 'texas', 'oregon'],
)
df1
df2
df1 + df2  # only cells whose row AND column exist in both frames get a value

# With no column label in common every cell of the result is NaN.
df3 = pd.DataFrame({'A': [1, 2]})
df4 = pd.DataFrame({'B': [3, 4]})
df3
df4
df3 + df4
# The arithmetic methods of a DataFrame take a fill_value that stands in for
# a label missing from ONE of the operands (here 0).
df1 = pd.DataFrame(np.arange(12.0).reshape((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.arange(16.0).reshape((4, 4)), columns=list('bcde'))
print(df1)
print(df2)

df1 + df2  # plain operator: every label not shared by both becomes NaN
df1.add(df2, fill_value=0)  # a cell missing from BOTH frames is still NaN
df1.radd(df2, fill_value=0)  # reflected add: df2 + df1
df1.sub(df2, fill_value=0)  # df1 - df2
df1.rsub(df2, fill_value=0)  # reflected subtract: df2 - df1
# The same pairs exist for division (div/rdiv), floor division
# (floordiv/rfloordiv), multiplication (mul/rmul) and powers (pow/rpow).

# Reindexing accepts a fill value as well.
df1.reindex(columns=df2.columns, fill_value=0)
# Operations between a DataFrame and a Series: by default the Series index is
# matched against the columns and the operation is repeated for every row.

# The NumPy analogue: a 2-D array minus one of its rows.
arr = np.arange(12.).reshape((3, 4))
print(arr)
arr[0]
arr - arr[0]  # the row is subtracted from every row: broadcasting

# The same with a DataFrame and a Series.
df1 = pd.DataFrame(
    np.arange(12.).reshape((4, 3)),
    columns=list('abc'),
    index=['utah', 'ohio', 'texas', 'oregon'],
)
series1 = df1.iloc[0]
print(df1)
series1
df1 - series1  # Series index matched to the columns, applied to each row

# A label found on only one side is added to the result (union of labels)
# and filled with NaN.
series2 = pd.Series(range(3), index=list('acd'))
df1 - series2

# To match on the row index instead, use the arithmetic method and name the
# axis to match on.
series3 = df1['a']
print(df1)
series3
df1.sub(series3, axis='index')
# Function application.
df = pd.DataFrame(
    np.random.randn(4, 3),
    columns=list('bde'),
    index=['utah', 'hoio', 'texas', 'oregon'],
)
print(df)
np.abs(df)  # NumPy element-wise functions accept a DataFrame directly


def value_range(x):
    """Return the spread (maximum minus minimum) of the Series *x*."""
    return x.max() - x.min()


def total(x):
    """Return the sum of the Series *x*."""
    return x.sum()


# apply() calls the function once per column by default and collects the
# scalar results into a Series.
df.apply(value_range)
# axis='columns' calls it once per row instead.
df.apply(value_range, axis='columns')
df.apply(total, axis='index')  # sums, means etc. are just as easy


def f(x):
    """Return the minimum and maximum of the Series *x* as a labelled Series."""
    return pd.Series([x.min(), x.max()], index=['min', 'max'])


# A function that returns a Series yields a DataFrame, which makes it easy
# to write custom descriptive statistics.
df.apply(f)


def f_str(x):
    """Format the number *x* as a string with two decimal places."""
    return '%.2f' % x


# Element-wise application. DataFrame.applymap is deprecated (pandas 2.1, in
# favour of DataFrame.map), so map each column instead; this works on old and
# new pandas alike.
df.apply(lambda column: column.map(f_str))
df['e'].map(f_str)  # Series.map is the element-wise method for a Series
# Sorting by index.
ser1 = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
print(ser1)
ser1.sort_index()

df = pd.DataFrame(
    np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=['d', 'a', 'b', 'c'],
)
print(df)
df.sort_index()  # rows, ascending (the defaults)
# The axis must be given by keyword: sort_index() takes keyword-only
# arguments on pandas >= 2.0, so sort_index(1) raises TypeError there.
df.sort_index(axis=1)
df.sort_index(ascending=False)  # descending order is available too
df.sort_index(axis=1, ascending=False)

# Sorting by value.
ser1.sort_values(ascending=False)
df.sort_values(by='b')  # order the rows by one column

df1 = pd.DataFrame({
    'b': [1, 2, 3, 1],
    'c': [-2, -4, 2, 3],
})
# Several keys: rows that tie on 'b' are ordered by 'c' (both descending).
df2 = df1.sort_values(by=['b', 'c'], ascending=[False, False])
# One key only: rows that tie on 'b' are not ordered by 'c', so the result
# can differ from df2.
df3 = df1.sort_values(by='b', ascending=False)
print(df2)
print(df3)
# rank() numbers the values from 1 in sorted order; by default tied values
# share the average of the ranks they span.
ser1 = pd.Series([7, -5, 7, 4, 2, 0, 4])
ser1.rank()

# method='first' breaks ties by order of appearance in the data.
ser1.rank(method='first')

# Ranks can also be assigned in descending order; method='max' gives every
# member of a tied group the highest rank number of the group
# (method='min' would give the lowest).
ser1.rank(ascending=False, method='max')
# Indexes with duplicate labels.
series = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
series
series.index.is_unique  # property: are all labels distinct?

# Selecting a duplicated label returns a Series, while a unique label returns
# a scalar.
series['a']
# The result type therefore depends on the data, which complicates code that
# assumes unique labels. The same holds for a DataFrame.
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df
df.loc['a']  # plain [] selects columns; loc selects rows by label
5.5 Descriptive statistics
These methods are designed to cope with missing data: NaN values are skipped by default (see skipna below).
df = pd.DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)
print(df)  # the nested list is laid out row by row under the given labels

df.sum()  # one total per column, returned as a Series
df.sum(axis=1)  # one total per row; NaN is skipped by default
df.mean(axis='columns', skipna=False)  # skipna=False: any NaN makes the result NaN

# Hierarchical index: several index levels can be defined on one axis, e.g.
# index=[['a','a','a','b','b','b','c','c','c'], [1,2,3,1,2,3,1,2,3]], and
# statistics can then be grouped by level.
# NOTE(review): the `level` argument of the reduction methods was removed in
# pandas 2.0 (use groupby(level=...)) -- confirm for the pandas in use.

# Index label of the maximum of each column.
df.idxmax()

# Cumulative sum down each column.
df.cumsum()

# Several summary statistics in one call.
df.describe()

# For non-numeric data describe() reports count/unique/top/freq instead.
df1 = pd.Series(['a', 'a', 'b', 'c'] * 4)
df1.describe()
# Summary of the descriptive-statistics methods
# count                          number of non-missing values
# describe                       summary statistics for a Series or for each DataFrame column
# argmin / argmax                integer position of the minimum / maximum
# idxmin / idxmax                index label of the minimum / maximum
# quantile                       sample quantile, for q between 0 and 1
# sum / mean / median
# mad                            mean absolute deviation (removed in pandas 2.0)
# var / std / skew / kurt
# cumsum / cummin / cummax / cumprod
# diff                           first-order difference
# pct_change                     percentage change
# # Correlation and covariance
# all_data = {
#     ticker: web.get_data_yahoo(ticker)
#     for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']  # dict comprehension
# }
# price = pd.DataFrame(
#     {ticker: data['Adj Close']
#      for ticker, data in all_data.items()}
# )
# volume = pd.DataFrame(
#     {ticker: data['Volume']
#      for ticker, data in all_data.items()}
# )
# returns = price.pct_change()
# returns.tail()
# returns['MSFT'].corr(returns['AAPL'])
# returns.MSFT.corr(returns['AAPL'])
# returns['MSFT'].cov(returns['AAPL'])
# returns.corr()
# returns.cov()
# # Another Series can also be passed in
# returns.corrwith(returns.IBM)
# # Passing a DataFrame computes the correlation between columns with the same name
# returns.corrwith(volume)
# Unique values, value counts and membership.
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

# Unique values, in order of first appearance.
uniques = obj.unique()
uniques.sort()  # sorts the returned array in place; obj itself is untouched
uniques

# Frequencies.
obj.value_counts()  # how often each value occurs, most frequent first
# For a plain array or sequence, wrap it in a Series first: the top-level
# pd.value_counts() function is deprecated (pandas 2.1+).
unsorted_counts = pd.Series(obj.values).value_counts(sort=False)

# Membership: isin() gives a boolean mask that is True where the value is one
# of the candidates, which can then be used to filter.
mask = obj.isin(['b', 'c'])
mask
obj[mask]

# Index(unique).get_indexer(values) gives, for every element of `values`, its
# integer position within `unique` (similar to a "match" function).
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)
# Build a frequency table: for every column, how often each value occurs.
data = pd.DataFrame(
    {
        'Qu1': [1, 3, 4, 3, 4],
        'Qu2': [2, 3, 1, 2, 3],
        'Qu3': [1, 5, 2, 4, 4],
    }
)
print(data)
# Apply Series.value_counts to each column (the top-level pd.value_counts
# function is deprecated in pandas 2.1+). The row labels of the result are the
# distinct values found in any column; a value that a column never contains
# comes out as NaN and is replaced by 0.
result = data.apply(pd.Series.value_counts).fillna(0)
print(result)