import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
SEED = 222
np.random.seed(SEED)
df = pd.read_csv('C:/Users/Administrator/Desktop/python.xiangmu/input.csv')
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
def get_train_test(test_size=0.95):
"""Split Data into train and test sets."""
y = 1 * (df.cand_pty_affiliation == "REP")
X = df.drop(["cand_pty_affiliation"], axis=1)
X = pd.get_dummies(X, sparse=True)
X.drop(X.columns[X.std() == 0], axis=1, inplace=True)
return train_test_split(X, y, test_size=test_size, random_state=SEED)
xtrain, xtest, ytrain, ytest = get_train_test()
print("\nExample data:")
df.head()
C:\Users\Administrator\AppData\Local\Temp\ipykernel_15012\1264077475.py:23: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
X.drop(X.columns[X.std() == 0], axis=1, inplace=True)
IndexError Traceback (most recent call last)
Input In [1], in <cell line: 26>()
23 X.drop(X.columns[X.std() == 0], axis=1, inplace=True)
24 return train_test_split(X, y, test_size=test_size, random_state=SEED)
---> 26 xtrain, xtest, ytrain, ytest = get_train_test()
28 # A look at the data
29 print("\nExample data:")
Input In [1], in get_train_test(test_size)
21 X = df.drop(["cand_pty_affiliation"], axis=1)
22 X = pd.get_dummies(X, sparse=True)
---> 23 X.drop(X.columns[X.std() == 0], axis=1, inplace=True)
24 return train_test_split(X, y, test_size=test_size, random_state=SEED)
File D:\anaconda\lib\site-packages\pandas\core\indexes\base.py:5055, in Index.getitem(self, key)
5048 if com.is_bool_indexer(key):
5049 # if we have list[bools, length=1e5] then doing this check+convert
5050 # takes 166 µs + 2.1 ms and cuts the ndarray.getitem
5051 # time below from 3.8 ms to 496 µs
5052 # if we already have ndarray[bool], the overhead is 1.4 µs or .25%
5053 key = np.asarray(key, dtype=bool)
-> 5055 result = getitem(key)
5056 # Because we ruled out integer above, we always get an arraylike here
5057 if result.ndim > 1:
IndexError: boolean index did not match indexed array along dimension 0; dimension is 158 but corresponding boolean dimension is 2
表格文件
input.csv
cand_pty_affiliation cand_office_st cand_office cand_status rpt_tp transaction_tp entity_tp state classification cycle transaction_amt
REP US P C Q3 15 IND NY Engineer 2016 500
DEM US P C M5 15E IND OR Math-Stat 2016 50
DEM US P C M3 15 IND TX Scientist 2008 250
DEM US P C Q2 15E IND IN Math-Stat 2016 250
REP US P C 12G 15 IND MA Engineer 2016 184
DEM US P C M10 15 IND NM Engineer 2008 200
DEM US P C M7 15 IND MT Scientist 2012 500
DEM US P C M7 15 IND FL Engineer 2008 250
REP LA H C 12P 15 IND LA Engineer 2008 1000
DEM MD S C Q2 15 IND MD Scientist 2016 100
DEM HI S C Q2 15E IND FL Scientist 2014 250
REP US P C M3 15 IND AZ Engineer 2016 100
DEM FL S C 12G 15E IND CA Engineer 2016 100
REP US P C M3 15 IND TX Engineer 2016 25
REP US P C YE 15 IND WA Engineer 2008 700
DEM NJ S C Q2 15 IND NJ Engineer 2008 1000
DEM US P C M6 15E IND NY Engineer 2016 10
DEM FL S C 30G 15E IND TX Engineer 2016 250
REP VA S C Q3 15 IND VA Engineer 2014 250
DEM US P C M6 15 IND WA Engineer 2012 250