{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Instacart user / aisle analysis\n",
    "\n",
    "1. Load the data\n",
    "2. Merge the tables\n",
    "3. Find the relationship between `user_id` and `aisle`\n",
    "4. Reduce dimensionality with PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Load the data\n",
    "order_products = pd.read_csv(\"./instacart/order_products__prior.csv\")\n",
    "products = pd.read_csv(\"./instacart/products.csv\")\n",
    "orders = pd.read_csv(\"./instacart/orders.csv\")\n",
    "aisles = pd.read_csv(\"./instacart/aisles.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. Merge the tables\n",
    "# order_products__prior.csv: order / product information\n",
    "#   columns: order_id, product_id, add_to_cart_order, reordered\n",
    "# products.csv: product information\n",
    "#   columns: product_id, product_name, aisle_id, department_id\n",
    "# orders.csv: order information per user\n",
    "#   columns: order_id, user_id, eval_set, order_number, ...\n",
    "# aisles.csv: the specific item category (aisle) a product belongs to\n",
    "#   columns: aisle_id, aisle\n",
    "\n",
    "# Join aisles with products so that every product_id carries its aisle name.\n",
    "# A single key is enough; repeating it in a list adds nothing.\n",
    "tab1 = pd.merge(aisles, products, on=\"aisle_id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach the order lines: one row per (order_id, product_id).\n",
    "tab2 = pd.merge(tab1, order_products, on=\"product_id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach the order header so that every row also carries user_id.\n",
    "tab3 = pd.merge(tab2, orders, on=\"order_id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
aisle_idaisleproduct_idproduct_namedepartment_idorder_idadd_to_cart_orderreordereduser_ideval_setorder_numberorder_doworder_hour_of_daydays_since_prior_order
01prepared soups salads209Italian Pasta Salad209424650114082prior260201.0
11prepared soups salads22853Pesto Pasta Salad209424640114082prior260201.0
24instant foods12087Chicken Flavor Ramen Noodle Soup994246150114082prior260201.0
34instant foods47570Original Flavor Macaroni & Cheese Dinner994246141114082prior260201.0
413prepared meals10089Dolmas2094246250114082prior260201.0
\n", "
" ],
      "text/plain": [
       " aisle_id aisle product_id \\\n",
       "0 1 prepared soups salads 209 \n",
       "1 1 prepared soups salads 22853 \n",
       "2 4 instant foods 12087 \n",
       "3 4 instant foods 47570 \n",
       "4 13 prepared meals 10089 \n",
       "\n",
       " product_name department_id order_id \\\n",
       "0 Italian Pasta Salad 20 94246 \n",
       "1 Pesto Pasta Salad 20 94246 \n",
       "2 Chicken Flavor Ramen Noodle Soup 9 94246 \n",
       "3 Original Flavor Macaroni & Cheese Dinner 9 94246 \n",
       "4 Dolmas 20 94246 \n",
       "\n",
       " add_to_cart_order reordered user_id eval_set order_number order_dow \\\n",
       "0 5 0 114082 prior 26 0 \n",
       "1 4 0 114082 prior 26 0 \n",
       "2 15 0 114082 prior 26 0 \n",
       "3 14 1 114082 prior 26 0 \n",
       "4 25 0 114082 prior 26 0 \n",
       "\n",
       " order_hour_of_day days_since_prior_order \n",
       "0 20 1.0 \n",
       "1 20 1.0 \n",
       "2 20 1.0 \n",
       "3 20 1.0 \n",
       "4 20 1.0 "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tab3.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. Find the relationship between user_id and aisle:\n",
    "# cross-tabulate the two columns (rows = user_id, columns = aisle).\n",
    "table = pd.crosstab(tab3[\"user_id\"], tab3[\"aisle\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the first 10000 rows of the crosstab for the steps below.\n",
    "# .iloc makes it explicit that this is a positional slice, not a slice on user_id labels.\n",
    "data = table.iloc[:10000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. PCA dimensionality reduction\n",
    "from sklearn.decomposition import PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1) Instantiate the transformer\n",
    "transfer = PCA(n_components=0.95)\n",
    "\n",
    "# 2) Call fit_transform\n",
    "data_new = transfer.fit_transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000, 42)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_new.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-2.36456828e+01, 2.30028678e+00, 
-2.71706275e+00, ...,\n",
       " 8.24685231e-01, -5.20365905e-01, 2.99847178e+00],\n",
       " [ 6.95477119e+00, 3.54966052e+01, 2.52655545e+00, ...,\n",
       " -1.15326520e+00, -1.37969318e+00, -1.07115466e-02],\n",
       " [-7.47792843e+00, 2.83147785e+00, -1.07306519e+01, ...,\n",
       " -3.55104796e-01, 3.85595697e-02, 1.92318882e+00],\n",
       " ...,\n",
       " [-2.86664024e+01, -1.26446961e+00, -1.14908062e+00, ...,\n",
       " 1.13859569e-03, 4.14189764e-01, 4.46163585e-01],\n",
       " [-2.88378748e+01, -1.70490822e+00, -1.47059942e+00, ...,\n",
       " -1.62743887e-01, 6.72795951e-01, 9.64403654e-02],\n",
       " [ 2.10412407e+02, -3.51935647e+01, 1.33671987e+01, ...,\n",
       " 1.46544596e+01, 1.56764794e+01, 1.67432890e+01]])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Estimator workflow\n",
    "from sklearn.cluster import KMeans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# KMeans starts from randomly chosen centroids; pin the seed so that the\n",
    "# cluster labels and the silhouette score are the same on every run.\n",
    "estimator = KMeans(n_clusters=3, random_state=42)\n",
    "# The trailing semicolon hides the estimator repr.\n",
    "estimator.fit(data_new);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_predict = estimator.predict(data_new)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       " 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0,\n",
       " 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,\n",
       " 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,\n",
       " 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,\n",
       " 
0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       " 2, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 2, 2,\n",
       " 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,\n",
       " 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,\n",
       " 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0,\n",
       " 2, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0,\n",
       " 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1,\n",
       " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0,\n",
       " 0, 0, 1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2], dtype=int32)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_predict[:300]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model evaluation: silhouette coefficient\n",
    "from sklearn.metrics import silhouette_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5396819903993841"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "silhouette_score(data_new, y_predict)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}