# day02_machine.py (2.1 KB)

  1. import pandas as pd
  2. import numpy as np
  3. # 1、读取数据
  4. path = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
  5. column_name = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
  6. 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
  7. 'Normal Nucleoli', 'Mitoses', 'Class']
  8. data = pd.read_csv(path, names=column_name)
  9. data.head()
  10. # 2、缺失值处理
  11. # 1)替换-》np.nan
  12. data = data.replace(to_replace="?", value=np.nan)
  13. # 2)删除缺失样本
  14. data.dropna(inplace=True)
  15. data.isnull().any() # 不存在缺失值
  16. # 3、划分数据集
  17. from sklearn.model_selection import train_test_split
  18. data.head()
  19. # 筛选特征值和目标值
  20. x = data.iloc[:, 1:-1]
  21. y = data["Class"]
  22. x.head()
  23. y.head()
  24. x_train, x_test, y_train, y_test = train_test_split(x, y)
  25. x_train.head()
  26. # 4、标准化
  27. from sklearn.preprocessing import StandardScaler
  28. transfer = StandardScaler()
  29. x_train = transfer.fit_transform(x_train)
  30. x_test = transfer.transform(x_test)
  31. x_train
  32. from sklearn.linear_model import LogisticRegression
  33. # 5、预估器流程
  34. estimator = LogisticRegression()
  35. estimator.fit(x_train, y_train)
  36. # 逻辑回归的模型参数:回归系数和偏置
  37. estimator.coef_
  38. estimator.intercept_
  39. # 6、模型评估
  40. # 方法1:直接比对真实值和预测值
  41. y_predict = estimator.predict(x_test)
  42. print("y_predict:\n", y_predict)
  43. print("直接比对真实值和预测值:\n", y_test == y_predict)
  44. # 方法2:计算准确率
  45. score = estimator.score(x_test, y_test)
  46. print("准确率为:\n", score)
  47. # 查看精确率、召回率、F1-score
  48. from sklearn.metrics import classification_report
  49. report = classification_report(y_test, y_predict, labels=[2, 4], target_names=["良性", "恶性"])
  50. print(report)
  51. y_test.head()
  52. # y_true:每个样本的真实类别,必须为0(反例),1(正例)标记
  53. # 将y_test 转换成 0 1
  54. y_true = np.where(y_test > 3, 1, 0)
  55. from sklearn.metrics import roc_auc_score
  56. roc_auc_score(y_true, y_predict)
  57. #Output 0.9562646028037384