# 1. Concise if/else usage (conditional expression)
a, b = 10, 5
# Conditional expression: evaluates to the first string when a > b,
# otherwise to the second one.
r = (
    "a Bigger "
    if a > b
    else "b Bigger "
)
# 2. Using loops
# String formatting with the % operator inside a counted while loop.
i = 1
while i <= 5:
    # %d is replaced by the integer value of i.
    print(" The present is the %d Secondary cycle " % (i))
    print("i=%d" % i)
    # Advance the counter; without this the condition would never become false.
    i += 1
# break usage: leave a for loop early once a given character is reached.
name = " The world cup is calling me "
for x in name:
    print('----')
    # NOTE(review): x is always a single character, while the literal below is
    # six characters long, so this comparison is never true and the loop runs
    # to the end. Replace the literal with the single character to stop at.
    if x == ' stay ':
        break
    # Only reached for characters seen before the break.
    print(x)
# 3. Data cleaning notes
import numpy as np
import pandas as pd
# Load the order data from an Excel workbook into a DataFrame, then inspect it.
# The bare expressions below only display something when run interactively
# (e.g. one per notebook cell); their results are not stored.
df=pd.read_excel(r"D:\ Study \python Study \00 Courseware for the second phase of data analysis \WEEK9 Python Data cleaning \code&data\data\user_orders.xlsx")
df.head()  # first rows of the table
df.shape  # (number of rows, number of columns)
df.index  # the row index
df.info()  # prints column dtypes and non-null counts
df.describe()  # summary statistics with the default settings
# Count of missing values per column; use sum(axis=1) to count per row instead.
df.isnull().sum()
df.duplicated()  # boolean Series flagging rows that repeat an earlier row
df.duplicated().sum()  # number of rows flagged as duplicates
# Extract certain rows and columns
# .loc selects by label: the row slice 5:10 includes both end labels.
df.loc[5:10,["name","age"]]
df1=df.set_index("name")  # new DataFrame that uses the "name" column as its index
df1.head()
df1.loc[" Li Xiaodan Li l",:]  # all columns of the row(s) with this index label
# .iloc selects by integer position: the end of the slice is excluded.
df.iloc[:,0:5]  # every row, columns at positions 0-4
df.iloc[0:5]  # rows at positions 0-4
df1.sample(frac=0.01)  # random sample of 1% of the rows
df1.sample(n=2,axis=1)  # random sample of two columns
df1.sample(n=2,random_state=420)  # random sample of two rows; the fixed seed makes it repeatable
# Data processing: data types
# Work on a copy so the DataFrame loaded earlier stays untouched.
df3 = df.copy()

# Store the ids as strings.
df3['id'] = df3['id'].astype('str')
df3.info()

# Strip the currency sign from both ends of each amount, then parse as float.
df3['custom_amt'] = (
    df3['custom_amt']
    .str.strip('¥')
    .astype('float')
)

# Parse the order dates with an explicit format.
# NOTE(review): the format has to match the stored text exactly, including the
# literal words and spaces; confirm it against the actual order_date values.
df3['order_date'] = pd.to_datetime(
    df3['order_date'],
    format="%Y year %m month %d Japan ",
)
# Duplicate value processing
# Preview only: returns a de-duplicated copy; df3 is unchanged because the
# result is not assigned.
df3.drop_duplicates()
# Drop the duplicate rows in df3 itself and renumber the index from 0.
df3.drop_duplicates(inplace=True,ignore_index=True)
# Handling of outliers
df3.describe([0.99])  # summary statistics with the 99th percentile requested
df3.loc[df3["age"]>=200,:]  # show the rows whose age is 200 or more
# Remove the row with index label 118.
# NOTE(review): presumably the outlier row found by the filter above; the
# label is hard-coded, so confirm it against the data before re-running.
df3=df3.drop(index=118)
# Alternative: keep every row whose age is not greater than 200. The result is
# not assigned, so df3 is unchanged. Note the comparison here is > while the
# filter above uses >=.
df3.loc[~(df3["age"]>200),:]
# Missing value processing
df3.isnull().mean()  # fraction of missing values in each column
# Returns a copy without the "edu" column; not assigned, so df3 still has it.
df3.drop(columns='edu')
# Keep only the columns whose share of missing values is at most 50%.
df3.loc[:,df3.isnull().mean()<=0.5]
# Fill missing values in the "age" column.
# Each expression returns a new Series; none is assigned back, so df3 itself
# is left unchanged.
df3["age"].mean()  # mean of the non-missing ages
df3["age"].fillna(df3["age"].mean())  # replace missing ages with that mean
# Forward fill: each missing age takes the previous non-missing value.
# ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
df3["age"].ffill()
# Text data processing