| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162 |
- import pandas as pd
- import numpy as np
- # 1、读取数据
- path = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
- column_name = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
- 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
- 'Normal Nucleoli', 'Mitoses', 'Class']
- data = pd.read_csv(path, names=column_name)
- data.head()
- # 2、缺失值处理
- # 1)替换-》np.nan
- data = data.replace(to_replace="?", value=np.nan)
- # 2)删除缺失样本
- data.dropna(inplace=True)
- data.isnull().any() # 不存在缺失值
- # 3、划分数据集
- from sklearn.model_selection import train_test_split
- data.head()
- # 筛选特征值和目标值
- x = data.iloc[:, 1:-1]
- y = data["Class"]
- x.head()
- y.head()
- x_train, x_test, y_train, y_test = train_test_split(x, y)
- x_train.head()
- # 4、标准化
- from sklearn.preprocessing import StandardScaler
- transfer = StandardScaler()
- x_train = transfer.fit_transform(x_train)
- x_test = transfer.transform(x_test)
- x_train
- from sklearn.linear_model import LogisticRegression
- # 5、预估器流程
- estimator = LogisticRegression()
- estimator.fit(x_train, y_train)
- # 逻辑回归的模型参数:回归系数和偏置
- estimator.coef_
- estimator.intercept_
- # 6、模型评估
- # 方法1:直接比对真实值和预测值
- y_predict = estimator.predict(x_test)
- print("y_predict:\n", y_predict)
- print("直接比对真实值和预测值:\n", y_test == y_predict)
- # 方法2:计算准确率
- score = estimator.score(x_test, y_test)
- print("准确率为:\n", score)
- # 查看精确率、召回率、F1-score
- from sklearn.metrics import classification_report
- report = classification_report(y_test, y_predict, labels=[2, 4], target_names=["良性", "恶性"])
- print(report)
- y_test.head()
- # y_true:每个样本的真实类别,必须为0(反例),1(正例)标记
- # 将y_test 转换成 0 1
- y_true = np.where(y_test > 3, 1, 0)
- from sklearn.metrics import roc_auc_score
- roc_auc_score(y_true, y_predict)
- #Output 0.9562646028037384
|