# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.cluster import KMeans
"""
programmer_1--> Summarize basic characteristics of the raw data with describe() and save them as a new table
programmer_2--> Clean the raw data, apply some operations to part of it, and save the result
programmer_3--> Standardize the data and save it
programmer_4--> Cluster the data with KMeans
"""
def programmer_1():
datafile = 'data/air_data.csv'
resultfile = 'tmp/explore.xls'
data = pd.read_csv(datafile, encoding='utf-8')
# Basic description of the data. The percentiles parameter specifies which quantiles to compute (e.g. the 1/4 quantile, the median); .T transposes the result, which makes it easier to read.
explore = data.describe(percentiles=[], include='all').T
# describe() automatically computes the number of non-null values; the number of null values has to be computed manually.
explore['null'] = len(data) - explore['count']
explore = explore[['null', 'max', 'min']]
explore.columns = [u' Null number ', u' Maximum ', u' minimum value ']
''' Only part of the exploration results are selected here .