Reference: 《Pandas in Plain Language: Data Processing and Analysis with Python》 (深入浅出Pandas)
pandas is remarkably powerful: it can read data in almost any format — CSV, Excel, SPSS, Stata, JSON, HTML, even data on the clipboard. This chapter shows how to read data. Although a basic read is a single line of code, there are many parameters and functions worth knowing.
CSV, the most basic data file format, gets the most detailed treatment here, because most parameters of the other readers work the same way as in read_csv.
import pandas as pd
import numpy as np
# File paths
pd.read_csv('data.csv') # if the file is in the same directory as the code
pd.read_csv(r'D:\data\wind-power.csv') # specify a full path (a raw string keeps backslashes literal)
pd.read_csv('data/my/my.data') # a CSV file's extension does not have to be .csv
# Read from a web address (URL)
pd.read_csv('https://www.gairuo.com/file/data/dataset/GDP-China.csv')
Read from a string
# Data can also be read from a StringIO object
from io import StringIO
data = ('col1,col2,col3\n'
'a,b,1\n'
'a,b,2\n'
'c,d,3')
pd.read_csv(StringIO(data)).to_csv('data.data', index=False) # save it as data.data for the examples below
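As a quick sanity check, this minimal sketch reads the in-memory string above back and inspects the result:
df = pd.read_csv(StringIO(data))
print(df.shape) # (3, 3): three rows, three columns
print(df.columns.tolist()) # ['col1', 'col2', 'col3']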
Separator
# The default separator is a comma; pass sep to use a different one
pd.read_csv('data.data', sep='\t') # Tab delimited tab
pd.read_table('data.data') # read_table The default is tab delimitation tab
pd.read_csv('data.data', sep='|') # pipe-delimited
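sep also accepts a regular expression, which helps when fields are separated by runs of whitespace of varying width. A small sketch on made-up in-memory data:
ws = 'a  b   c\n1 2  3\n4  5 6'
pd.read_csv(StringIO(ws), sep=r'\s+') # any run of whitespace separates fields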
Specify column names and headers
# Specify column names and headers
pd.read_csv(data, header=0) # the first line is the header (the default)
pd.read_csv(data, header=None) # there is no header row
pd.read_csv(data, names=['c1', 'c2']) # specify a list of column names
pd.read_csv(data, names=['c1', 'c2'], header=None) # column names plus an explicit no-header flag
# If there are no column names, pandas can assign them: a prefix plus a serial number
pd.read_csv(data, prefix='c_', header=None) # columns become c_0, c_1, ... (prefix was removed in pandas 2.0)
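Since prefix is gone in pandas 2.0, the same effect can be had by renaming after reading; a sketch using the in-memory data from above:
df = pd.read_csv(StringIO(data), header=None)
df.columns = [f'c_{i}' for i in range(df.shape[1])] # c_0, c_1, c_2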
Specify the index
# Specify the index
pd.read_csv(data, index_col=False) # do not use the first column as the index
pd.read_csv(data, index_col=0) # use the column at this position as the index
pd.read_csv(data, index_col='year') # specify the index by column name
pd.read_csv(data, index_col=['a', 'b']) # a MultiIndex from several column names
pd.read_csv(data, index_col=[0, 3]) # a MultiIndex from several column positions
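To see what a multi-column index produces, here is a small sketch on made-up data:
csv = 'a,b,c\nx,1,10\nx,2,20\ny,1,30'
df = pd.read_csv(StringIO(csv), index_col=['a', 'b']) # two-level MultiIndex
df.loc[('x', 2), 'c'] # 20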
Read Partial Columns
# Read Partial Columns
pd.read_csv(data, usecols=[0, 4, 3]) # read only the given columns, by position; list order is ignored
pd.read_csv(data, usecols=['c1', 'c5']) # read only the given columns, by name
# To control column order, select after reading -- this is ordinary DataFrame column selection
pd.read_csv(data, usecols=['c1', 'c5'])[['c5', 'c1']]
# A callable can also select columns; it filters by name and does not control order
pd.read_csv(data, usecols=lambda x: x.upper() in ['COL3', 'COL1'])
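The following sketch (made-up data) shows that usecols keeps file order regardless of how the selection is written, so an explicit selection after reading is the reliable way to reorder:
csv = 'col1,col2,col3\n1,2,3\n4,5,6'
df = pd.read_csv(StringIO(csv), usecols=lambda c: c.upper() in ['COL3', 'COL1'])
df.columns.tolist() # ['col1', 'col3'] -- file order, not list order
df[['col3', 'col1']] # reorder explicitly if needed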
Handle duplicate column names
# Handle duplicate column names
data='a,b,a\n0,1,2\n5,6,4'
pd.read_csv(StringIO(data), mangle_dupe_cols=True) # duplicates get a numeric suffix (the parameter was removed in pandas 2.0; the renaming is now always on)
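A quick sketch of what the renaming looks like, using the string defined just above:
pd.read_csv(StringIO(data)).columns.tolist() # ['a', 'b', 'a.1'] -- the second 'a' is suffixed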
Data types
# Data types
data = 'https://www.gairuo.com/file/data/dataset/GDP-China.csv'
# Specify the data type
pd.read_csv(data, dtype=np.float64) # apply one type to all columns
pd.read_csv(data, dtype={'c1': np.float64, 'c2': str}).info() # specify types per column
# Note: dtype does not accept a positional list of types; use a dict,
# and parse datetime columns with parse_dates rather than dtype
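A runnable sketch of per-column typing on in-memory data; converters is a related option that applies a function per column instead of a plain cast:
csv = 'c1,c2\n1,x\n2,y'
pd.read_csv(StringIO(csv), dtype={'c1': np.float64, 'c2': str}).dtypes # c1 float64, c2 object
pd.read_csv(StringIO(csv), converters={'c2': str.upper}) # apply a function to each value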
Skip the specified line
# Skip the specified line
pd.read_csv(data, skiprows=2) # skip the first two lines
pd.read_csv(data, skiprows=range(2)) # skip the first two lines
pd.read_csv(data, skiprows=[24, 235, 65]) # skip the given line numbers
pd.read_csv(data, skiprows=np.array([24, 235, 65])) # same, given as an array
pd.read_csv(data, skiprows=lambda x: x % 2 != 0) # skip every other line
pd.read_csv(data, skipfooter=1) # skip lines at the end (requires engine='python')
pd.read_csv(data, skip_blank_lines=True) # skip empty lines (this is the default)
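A small sketch combining these cleanup options on made-up data:
csv = 'exported junk line\ncol1,col2\n1,2\n\n3,4'
pd.read_csv(StringIO(csv), skiprows=1, skip_blank_lines=True) # drop the junk line and the empty line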
Reads the specified number of rows
# Reads the specified number of rows
pd.read_csv(data, nrows=1000) # read only the first 1000 rows
Null value substitution
# Null value substitution
pd.read_csv(data, na_values=[0]) # the value 0 is treated as missing
pd.read_csv(data, na_values='?') # '?' is treated as missing
pd.read_csv(data, na_values='abc') # the string 'abc' is treated as missing
pd.read_csv(data, na_values={'c': 3, 1: [2, 5]}) # per column: 3 in column 'c', 2 and 5 in column 1
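A sketch of the substitution on made-up data:
csv = 'c1,c2\n?,1\nabc,0'
pd.read_csv(StringIO(csv), na_values=['?', 'abc', 0]) # all three tokens become NaN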
Parse date and time
# Parse date and time
data = 'D:/data/LBMA-GOLD.csv'
pd.read_csv(data, parse_dates=True).info() # try to parse the index as dates
pd.read_csv(data, parse_dates=['Date']).info() # name the date/time fields to parse
# Merge columns 0, 1 and 2 and parse them as one datetime column named 'time'
# (merging columns this way was deprecated in recent pandas versions)
pd.read_csv('D:/data/wind-power.csv', parse_dates={'time': [0, 1, 2]}).info()
# Keep the original columns as well
pd.read_csv('D:/data/wind-power.csv', parse_dates={'time': [0, 1, 2]}, keep_date_col=True).info()
# Data where the day comes before the month
pd.read_csv(data, dayfirst=True, parse_dates=[0])
# Specify a custom date parser; the default is dateutil.parser.parser
# (date_parser is deprecated since pandas 2.0 in favour of date_format;
# pd.io.date_converters and pd.datetime have been removed)
from datetime import datetime
date_parser = lambda x: pd.to_datetime(x, utc=True, format='%d%m%Y')
date_parser = lambda d: datetime.strptime(d, '%d%b%Y')
pd.read_csv(data, date_parser=date_parser)
# Try to infer the datetime format for speed (also deprecated since pandas 2.0)
pd.read_csv(data, date_parser=date_parser, infer_datetime_format=True)
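In pandas 2.0+ the supported way to give a format is the date_format parameter; a sketch on in-memory data:
csv = 'Date,price\n01Jan2021,10\n02Jan2021,11'
df = pd.read_csv(StringIO(csv), parse_dates=['Date'], date_format='%d%b%Y')
df['Date'].dtype # datetime64[ns]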
Read compressed files
# Read compressed files
pd.read_csv('sample.csv.gz', compression='gzip') # compression is inferred from the extension if not given
# Specify the encoding when reading
pd.read_csv('gairuo.csv', encoding='utf-8')
pd.read_csv('gairuo.csv', encoding='gb2312') # a common Chinese encoding
Symbols
# Symbols
pd.read_csv('test.csv', thousands=',') # thousands separator
pd.read_csv('test.csv', decimal=',') # character to treat as the decimal point, default '.'
pd.read_csv(StringIO(data), escapechar='\n', encoding='utf-8') # use newline as the escape character (filters line breaks)
pd.read_csv(StringIO(s), sep=',', comment='#', skiprows=1) # anything after '#' on a line is ignored
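A sketch of the thousands separator on made-up quoted data:
csv = 'price\n"1,234.5"\n"2,345.6"'
pd.read_csv(StringIO(csv), thousands=',') # price is parsed as float: 1234.5, 2345.6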
Export to CSV
df.to_csv('done.csv')
df.to_csv('data/done.csv') # a directory path can be given
df.to_csv('done.csv', index=False) # don't write the index
df.to_csv('done.csv', encoding='utf-8') # specify the encoding
# sep can also be used on write to choose the output delimiter
df.Q1.to_csv('Q1_test.txt', index=None) # export a single column to a .txt file
Export compressed files
df = pd.read_csv('https://www.gairuo.com/file/data/dataset/GDP-China.csv')
# Create a zip archive out.zip that contains out.csv
com_opts = dict(method='zip', archive_name='out.csv')
df.to_csv('out.zip', encoding='gbk', index=False, compression=com_opts)
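Reading it back is symmetric; a sketch, assuming out.zip was written as above:
pd.read_csv('out.zip', encoding='gbk') # compression is inferred from the .zip extension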
Excel
# Reading .xlsx files requires the openpyxl package
pd.read_excel('team.xlsx')
pd.read_excel('https://www.gairuo.com/file/data/dataset/team.xlsx')
Multiple sheets in one Excel file
# Multiple sheets in one Excel file
xlsx = pd.ExcelFile('data.xlsx')
df = pd.read_excel(xlsx, 'Sheet1') # read one sheet
xlsx.parse('sheet1') # parse the named sheet into a DataFrame
# All sheet names in the workbook
xlsx.sheet_names # ['sheet1', 'sheet2', 'sheet3', 'sheet4']
Specify which sheet to read
# Specify which sheet to read
pd.read_excel('team.xlsx', sheet_name=1) # the second sheet, by position
pd.read_excel('team.xlsx', sheet_name='summary') # by name
# Read several sheets at once
pd.read_excel('team.xlsx', sheet_name=[0, 1, 'Sheet5']) # the first, second and 'Sheet5' sheets, returned as a dict of DataFrames
dfs = pd.read_excel('team.xlsx', sheet_name=None) # all sheets
dfs['Sheet5']
# Write a DataFrame to an Excel file
df.to_excel('file.xlsx', sheet_name='sheet2', index=False)
Multiple sheets
df1 = df.describe()
# Several sheets can be written to one file through an ExcelWriter
with pd.ExcelWriter('path_to_file.xlsx') as xls:
    df1.to_excel(xls, sheet_name='Sheet1')
    df.to_excel(xls, sheet_name='Sheet2', index=False)
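With the openpyxl engine, ExcelWriter can also append to an existing workbook; a hedged sketch (mode='a' requires the file to already exist):
with pd.ExcelWriter('path_to_file.xlsx', mode='a', engine='openpyxl') as xls:
    df1.to_excel(xls, sheet_name='Sheet3') # add one more sheet to the same file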
Read HTML tables
dfs = pd.read_html('https://www.gairuo.com/p/pandas-io')
dfs[0] # look at the first DataFrame
dfs = pd.read_html('data.html', header=0) # read a local HTML file; the first row is the header
url = 'https://www.gairuo.com/p/pandas-io'
dfs = pd.read_html(url, index_col=0) # use the first column as the index
# When a page has several tables, match one by its HTML attributes
dfs1 = pd.read_html(url, attrs={'id': 'table'}) # the table with id='table'; note a list is still returned
# dfs1[0]
dfs2 = pd.read_html(url, attrs={'class': 'sortable'}) # tables with class='sortable'
# Most of the common parameters are identical to read_csv; see above
df.to_html('page.html')
df.to_html('page.html', columns=[0]) # output only the given columns
df.to_html('page.html', classes=['class1', 'class2']) # output with the given CSS classes
JSON
pd.read_json('data.json')
json = '''{"columns":["col 1","col 2"],
"index":["row 1","row 2"],
"data":[["a","b"],["c","d"]]}
'''
pd.read_json(json) # default orient treats top-level keys as columns
pd.read_json(json, orient='split') # interpret the 'split' layout: columns, index, data
# orient supports: 'split', 'records', 'index', 'columns', 'values', 'table'
# (since pandas 2.1, wrap literal JSON strings in StringIO before passing them)
df = pd.DataFrame([['a', 'b'], ['c', 'd']],
index=['row 1', 'row 2'],
columns=['col 1', 'col 2'])
# Output a JSON string
df.to_json(orient='split')
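The other orientations lay out the same frame differently; a quick sketch with the df defined above:
df.to_json(orient='records') # '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
df.to_json(orient='index')   # keyed by row label
df.to_json(orient='columns') # keyed by column label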
Clipboard
For example, copy a table of data from a web page (the original example copies a stock table from the East Money site), then run this in Python:
pd.read_clipboard(header=None)
and the copied table comes back as a DataFrame. Very convenient.
df = pd.DataFrame({'A': [1, 2, 3],
'B': [4, 5, 6],
'C': ['p', 'q', 'r']},
index=['x', 'y', 'z'])
df.to_clipboard()
# The DataFrame contents are now on the clipboard and can be pasted anywhere
Markdown
print(df.to_markdown()) # render as a Markdown table (requires the tabulate package)
Other formats
Files from SPSS, SAS, Stata and other statistics packages, as well as SQL databases, can also be read with pandas:
pd.read_stata('file.dta') # read a Stata file
pd.read_spss('file.sav') # read an SPSS file (requires pyreadstat)
pd.read_sas('file.sas7bdat') # read a SAS file
pd.read_sql('SELECT * FROM table_name', conn) # read from SQL, given an open database connection conn
# Read data with fixed column width
colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)] # half-open (start, end) intervals for each column
pd.read_fwf('demo.txt', colspecs=colspecs, header=None, index_col=0)
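When the columns are contiguous, widths can be passed instead of colspecs; a sketch with made-up widths (the colspecs above leave gaps, which widths cannot express):
pd.read_fwf('demo.txt', widths=[6, 14, 13, 9], header=None) # hypothetical contiguous column widths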