{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# 1. Load the data\n", "# Column names are passed explicitly through `names`, in file order.\n", "path = \"https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data\"\n", "column_name = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',\n", "               'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',\n", "               'Normal Nucleoli', 'Mitoses', 'Class']\n", "\n", "# Missing entries are marked with \"?\" in this file. Declaring the marker at load time\n", "# makes pandas parse it as NaN, so the affected column stays numeric instead of\n", "# silently becoming an object (string) column.\n", "data = pd.read_csv(path, names=column_name, na_values=\"?\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sample code numberClump ThicknessUniformity of Cell SizeUniformity of Cell ShapeMarginal AdhesionSingle Epithelial Cell SizeBare NucleiBland ChromatinNormal NucleoliMitosesClass
010000255111213112
1100294554457103212
210154253111223112
310162776881343712
410170234113213112
\n", "
" ], "text/plain": [ " Sample code number Clump Thickness Uniformity of Cell Size \\\n", "0 1000025 5 1 \n", "1 1002945 5 4 \n", "2 1015425 3 1 \n", "3 1016277 6 8 \n", "4 1017023 4 1 \n", "\n", " Uniformity of Cell Shape Marginal Adhesion Single Epithelial Cell Size \\\n", "0 1 1 2 \n", "1 4 5 7 \n", "2 1 1 2 \n", "3 8 1 3 \n", "4 1 3 2 \n", "\n", " Bare Nuclei Bland Chromatin Normal Nucleoli Mitoses Class \n", "0 1 3 1 1 2 \n", "1 10 3 2 1 2 \n", "2 2 3 1 1 2 \n", "3 4 3 7 1 2 \n", "4 1 3 1 1 2 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# 2. Handle missing values\n", "#   1) replace the \"?\" placeholder with np.nan\n", "#   2) drop every sample (row) that still contains a missing value\n", "#   3) cast back to integers: a column that held \"?\" was read as strings (object\n", "#      dtype) and would otherwise reach the scaler as non-numeric data. Every value\n", "#      in this table is an integer code, so a failed cast signals bad input.\n", "# The chain builds a new frame instead of mutating in place, so the cell can be\n", "# re-run safely.\n", "data = (\n", "    data.replace(to_replace=\"?\", value=np.nan)\n", "    .dropna()\n", "    .astype(\"int64\")\n", ")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sample code number False\n", "Clump Thickness False\n", "Uniformity of Cell Size False\n", "Uniformity of Cell Shape False\n", "Marginal Adhesion False\n", "Single Epithelial Cell Size False\n", "Bare Nuclei False\n", "Bland Chromatin False\n", "Normal Nucleoli False\n", "Mitoses False\n", "Class False\n", "dtype: bool" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.isnull().any() # no missing values remain" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# 3. Split the dataset\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sample code numberClump ThicknessUniformity of Cell SizeUniformity of Cell ShapeMarginal AdhesionSingle Epithelial Cell SizeBare NucleiBland ChromatinNormal NucleoliMitosesClass
010000255111213112
1100294554457103212
210154253111223112
310162776881343712
410170234113213112
\n", "
" ], "text/plain": [ " Sample code number Clump Thickness Uniformity of Cell Size \\\n", "0 1000025 5 1 \n", "1 1002945 5 4 \n", "2 1015425 3 1 \n", "3 1016277 6 8 \n", "4 1017023 4 1 \n", "\n", " Uniformity of Cell Shape Marginal Adhesion Single Epithelial Cell Size \\\n", "0 1 1 2 \n", "1 4 5 7 \n", "2 1 1 2 \n", "3 8 1 3 \n", "4 1 3 2 \n", "\n", " Bare Nuclei Bland Chromatin Normal Nucleoli Mitoses Class \n", "0 1 3 1 1 2 \n", "1 10 3 2 1 2 \n", "2 2 3 1 1 2 \n", "3 4 3 7 1 2 \n", "4 1 3 1 1 2 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# Select the features and the target.\n", "# Features: every column between the leading sample ID and the trailing class label\n", "# (label slices include both end points).\n", "x = data.loc[:, \"Clump Thickness\":\"Mitoses\"]\n", "# Target: the class label column.\n", "y = data.loc[:, \"Class\"]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Clump ThicknessUniformity of Cell SizeUniformity of Cell ShapeMarginal AdhesionSingle Epithelial Cell SizeBare NucleiBland ChromatinNormal NucleoliMitoses
0511121311
15445710321
2311122311
3688134371
4411321311
\n", "
" ], "text/plain": [ " Clump Thickness Uniformity of Cell Size Uniformity of Cell Shape \\\n", "0 5 1 1 \n", "1 5 4 4 \n", "2 3 1 1 \n", "3 6 8 8 \n", "4 4 1 1 \n", "\n", " Marginal Adhesion Single Epithelial Cell Size Bare Nuclei \\\n", "0 1 2 1 \n", "1 5 7 10 \n", "2 1 2 2 \n", "3 1 3 4 \n", "4 3 2 1 \n", "\n", " Bland Chromatin Normal Nucleoli Mitoses \n", "0 3 1 1 \n", "1 3 2 1 \n", "2 3 1 1 \n", "3 3 7 1 \n", "4 3 1 1 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 2\n", "1 2\n", "2 2\n", "3 2\n", "4 2\n", "Name: Class, dtype: int64" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y.head()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Fix the shuffling seed so that the train/test partition, and therefore every\n", "# downstream coefficient and metric, is identical on each Restart & Run All.\n", "x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Clump ThicknessUniformity of Cell SizeUniformity of Cell ShapeMarginal AdhesionSingle Epithelial Cell SizeBare NucleiBland ChromatinNormal NucleoliMitoses
71610281027810
413512121311
159991036107106
371113121111
331511121312
\n", "
" ], "text/plain": [ " Clump Thickness Uniformity of Cell Size Uniformity of Cell Shape \\\n", "71 6 10 2 \n", "413 5 1 2 \n", "159 9 9 10 \n", "371 1 1 3 \n", "331 5 1 1 \n", "\n", " Marginal Adhesion Single Epithelial Cell Size Bare Nuclei \\\n", "71 8 10 2 \n", "413 1 2 1 \n", "159 3 6 10 \n", "371 1 2 1 \n", "331 1 2 1 \n", "\n", " Bland Chromatin Normal Nucleoli Mitoses \n", "71 7 8 10 \n", "413 3 1 1 \n", "159 7 10 6 \n", "371 1 1 1 \n", "331 3 1 2 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_train.head()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# 4. Standardization\n", "from sklearn.preprocessing import StandardScaler" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/zwy/.virtualenvs/python4/lib/python3.6/site-packages/sklearn/preprocessing/data.py:617: DataConversionWarning: Data with input dtype int64, object were all converted to float64 by StandardScaler.\n", " return self.partial_fit(X, y)\n", "/Users/zwy/.virtualenvs/python4/lib/python3.6/site-packages/sklearn/base.py:462: DataConversionWarning: Data with input dtype int64, object were all converted to float64 by StandardScaler.\n", " return self.fit(X, **fit_params).transform(X)\n", "/Users/zwy/.virtualenvs/python4/lib/python3.6/site-packages/ipykernel_launcher.py:3: DataConversionWarning: Data with input dtype int64, object were all converted to float64 by StandardScaler.\n", " This is separate from the ipykernel package so we can avoid doing imports until\n" ] } ], "source": [ "# Fit the scaler on the training split only, then rescale both splits with that\n", "# same fitted scaler so no test-set statistics leak into the model.\n", "transfer = StandardScaler().fit(x_train)\n", "x_train = transfer.transform(x_train)\n", "x_test = transfer.transform(x_test)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0.55384961, 2.25850408, -0.39241988, ..., 1.48150122,\n", " 1.58884606, 4.5986847 ],\n", " [ 0.20071013, 
-0.68803805, -0.39241988, ..., -0.17196717,\n", " -0.61573167, -0.34779968],\n", " [ 1.61326804, 1.93111051, 2.29545943, ..., 1.48150122,\n", " 2.21872542, 2.4002472 ],\n", " ...,\n", " [-0.50556882, -0.68803805, -0.72840479, ..., -0.99870137,\n", " -0.61573167, -0.34779968],\n", " [ 0.20071013, 2.25850408, 2.29545943, ..., 0.65476703,\n", " 2.21872542, 0.75141907],\n", " [ 1.96640752, 0.29414266, 0.27954995, ..., 0.65476703,\n", " 0.01414768, 0.75141907]])" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_train" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/zwy/.virtualenvs/python4/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. 
Specify a solver to silence this warning.\n", " FutureWarning)\n" ] }, { "data": { "text/plain": [ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='warn',\n", " n_jobs=None, penalty='l2', random_state=None, solver='warn',\n", " tol=0.0001, verbose=0, warm_start=False)" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 5. Estimator workflow\n", "# Name the solver explicitly: the library default is version-dependent (it changes\n", "# to 'lbfgs' in scikit-learn 0.22), so relying on it makes the fitted parameters\n", "# depend on the installed release and emits a FutureWarning on older ones.\n", "# 'liblinear' is assumed to be the pre-0.22 default this notebook was run with -\n", "# confirm against the scikit-learn changelog.\n", "estimator = LogisticRegression(solver=\"liblinear\")\n", "estimator.fit(x_train, y_train)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1.39074328, 0.24254804, 0.77224014, 0.74201473, 0.23764145,\n", " 1.04861389, 0.90381812, 0.53872577, 0.90103902]])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Model parameters of the logistic regression: coefficients and intercept (bias)\n", "estimator.coef_" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([-1.18507135])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "estimator.intercept_" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "y_predict:\n", " [2 2 2 2 4 4 2 2 2 2 4 4 2 4 2 2 4 2 4 4 2 2 4 4 2 4 2 2 4 2 2 2 2 4 2 4 4\n", " 4 2 2 4 2 4 4 2 2 2 4 4 2 2 2 2 2 2 4 4 2 2 2 4 4 4 2 4 2 4 2 2 2 2 2 4 2\n", " 4 4 2 2 4 2 2 2 2 2 2 4 4 4 2 2 2 2 2 2 2 2 2 2 4 4 4 4 4 2 2 2 2 2 2 4 2\n", " 2 4 4 2 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2 2 4 2 4 4 2 4 2 2 4 4 4 4\n", " 2 4 2 2 4 4 2 2 2 4 2 2 2 4 4 2 2 2 2 2 2 4 2]\n", "直接比对真实值和预测值:\n", " 432 True\n", "256 True\n", "203 True\n", "281 True\n", "588 True\n", "590 True\n", "66 True\n", "473 True\n", "637 True\n", "537 True\n", "103 True\n", "334 True\n", "8 True\n", "44 True\n", "141 True\n", "90 True\n", "329 True\n", "454 True\n", "440 True\n", "200 True\n", "665 True\n", "495 True\n", 
"279 True\n", "272 True\n", "43 False\n", "282 True\n", "22 True\n", "548 True\n", "330 True\n", "654 True\n", " ... \n", "340 True\n", "470 True\n", "343 True\n", "253 True\n", "264 True\n", "262 True\n", "132 True\n", "51 False\n", "42 True\n", "250 True\n", "651 True\n", "232 False\n", "581 True\n", "94 True\n", "385 True\n", "566 True\n", "386 True\n", "234 True\n", "189 True\n", "556 True\n", "605 True\n", "112 True\n", "529 True\n", "345 True\n", "410 True\n", "327 True\n", "480 True\n", "350 True\n", "381 True\n", "545 True\n", "Name: Class, Length: 171, dtype: bool\n", "准确率为:\n", " 0.9649122807017544\n" ] } ], "source": [ "# 6. Model evaluation\n", "# Method 1: compare the predictions with the true labels element by element\n", "y_predict = estimator.predict(x_test)\n", "is_correct = y_test == y_predict\n", "print(\"y_predict:\\n\", y_predict)\n", "print(\"直接比对真实值和预测值:\\n\", is_correct)\n", "\n", "# Method 2: let the estimator score itself on the test split (accuracy)\n", "score = estimator.score(x_test, y_test)\n", "print(\"准确率为:\\n\", score)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# Inspect precision, recall and F1-score\n", "from sklearn.metrics import classification_report" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "# Per-class report; `labels` fixes the row order and `target_names` supplies the\n", "# display name for each label in that same order.\n", "report = classification_report(\n", "    y_test,\n", "    y_predict,\n", "    labels=[2, 4],\n", "    target_names=[\"良性\", \"恶性\"],\n", ")" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 良性 0.95 0.99 0.97 107\n", " 恶性 0.98 0.92 0.95 64\n", "\n", " micro avg 0.96 0.96 0.96 171\n", " macro avg 0.97 0.96 0.96 171\n", "weighted avg 0.97 0.96 0.96 171\n", "\n" ] } ], "source": [ "print(report)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "432 2\n", "256 2\n", "203 2\n", "281 2\n", "588 4\n", "Name: Class, dtype: int64" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_test.head()" ] }, { "cell_type": "code", 
"execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# y_true: the true class of every sample, which must be encoded as 0 (negative) / 1 (positive)\n", "# Convert y_test (labels 2 / 4) to 0 / 1: label 4 becomes the positive class\n", "y_true = np.where(y_test > 3, 1, 0)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,\n", " 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1,\n", " 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,\n", " 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,\n", " 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,\n", " 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,\n", " 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,\n", " 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0])" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_true" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import roc_auc_score" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9562646028037384" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# ROC AUC measures how well a continuous score ranks positives above negatives.\n", "# Feeding it hard class predictions collapses the curve to a single threshold and\n", "# misreports the model, so score with the predicted probability instead.\n", "# predict_proba columns follow estimator.classes_ (the sorted labels [2, 4]), so\n", "# column 1 is the probability of label 4, the class encoded as 1 in y_true.\n", "y_score = estimator.predict_proba(x_test)[:, 1]\n", "roc_auc_score(y_true, y_score)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }