{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Python 正则表达式 RegEx" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 导入模块" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 简单 Python 匹配" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "True\n", "False\n" ] } ], "source": [ "# matching string\n", "pattern1 = \"cat\"\n", "pattern2 = \"bird\"\n", "string = \"dog runs to cat\"\n", "print(pattern1 in string) \n", "print(pattern2 in string) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 用正则寻找配对" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(12, 15), match='cat'>\n", "None\n" ] } ], "source": [ "# regular expression\n", "pattern1 = \"cat\"\n", "pattern2 = \"bird\"\n", "string = \"dog runs to cat\"\n", "print(re.search(pattern1, string)) \n", "print(re.search(pattern2, string)) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 匹配多种可能 使用 []" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(4, 7), match='run'>\n" ] } ], "source": [ "# multiple patterns (\"run\" or \"ran\")\n", "ptn = r\"r[au]n\" \n", "print(re.search(ptn, \"dog runs to cat\")) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 匹配更多种可能" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "None\n", "<_sre.SRE_Match object; span=(4, 7), match='run'>\n", "<_sre.SRE_Match object; span=(4, 7), match='r2n'>\n", "<_sre.SRE_Match object; span=(4, 7), match='run'>\n" ] } ], "source": [ "# continue\n", 
"print(re.search(r\"r[A-Z]n\", \"dog runs to cat\")) \n", "print(re.search(r\"r[a-z]n\", \"dog runs to cat\")) \n", "print(re.search(r\"r[0-9]n\", \"dog r2ns to cat\")) \n", "print(re.search(r\"r[0-9a-z]n\", \"dog runs to cat\")) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 特殊种类匹配" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 数字" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(4, 7), match='r4n'>\n", "<_sre.SRE_Match object; span=(0, 3), match='run'>\n" ] } ], "source": [ "# \\d : decimal digit\n", "print(re.search(r\"r\\dn\", \"run r4n\")) \n", "# \\D : any non-decimal digit\n", "print(re.search(r\"r\\Dn\", \"run r4n\")) \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 空白" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(0, 3), match='r\\nn'>\n", "<_sre.SRE_Match object; span=(4, 7), match='r4n'>\n" ] } ], "source": [ "# \\s : any white space [\\t\\n\\r\\f\\v]\n", "print(re.search(r\"r\\sn\", \"r\\nn r4n\")) \n", "# \\S : opposite to \\s, any non-white space\n", "print(re.search(r\"r\\Sn\", \"r\\nn r4n\")) \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 所有字母数字和\"_\"" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(4, 7), match='r4n'>\n", "<_sre.SRE_Match object; span=(0, 3), match='r\\nn'>\n" ] } ], "source": [ "# \\w : [a-zA-Z0-9_]\n", "print(re.search(r\"r\\wn\", \"r\\nn r4n\")) \n", "# \\W : opposite to \\w\n", "print(re.search(r\"r\\Wn\", \"r\\nn r4n\")) \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 单词边界" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(4, 8), match='runs'>\n", "<_sre.SRE_Match object; span=(5, 11), match=' runs '>\n" ] } ], "source": [ "# \\b : empty string (only at the start or end of the word)\n", "print(re.search(r\"\\bruns\\b\", \"dog runs to cat\")) \n", "# \\B : empty string (but not at the start or end of a word)\n", "print(re.search(r\"\\B runs \\B\", \"dog   runs  to cat\")) \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 特殊字符 任意字符" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(0, 5), match='runs\\\\'>\n", "<_sre.SRE_Match object; span=(0, 3), match='r[n'>\n" ] } ], "source": [ "# \\\\ : match \\\n", "print(re.search(r\"runs\\\\\", \"runs\\ to me\")) \n", "# . : match anything (except \\n)\n", "print(re.search(r\"r.n\", \"r[ns to me\")) \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 句尾句首" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(0, 3), match='dog'>\n", "<_sre.SRE_Match object; span=(12, 15), match='cat'>\n" ] } ], "source": [ "# ^ : match line beginning\n", "print(re.search(r\"^dog\", \"dog runs to cat\")) \n", "# $ : match line ending\n", "print(re.search(r\"cat$\", \"dog runs to cat\")) \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 是否" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(0, 6), match='Monday'>\n", "<_sre.SRE_Match object; span=(0, 3), match='Mon'>\n" ] } ], "source": [ "# ? 
: may or may not occur\n", "print(re.search(r\"Mon(day)?\", \"Monday\")) \n", "print(re.search(r\"Mon(day)?\", \"Mon\")) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 多行匹配" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "None\n", "<_sre.SRE_Match object; span=(18, 19), match='I'>\n" ] } ], "source": [ "# multi-line\n", "string = \"\"\"\n", "dog runs to cat.\n", "I run to dog.\n", "\"\"\"\n", "print(re.search(r\"^I\", string)) \n", "print(re.search(r\"^I\", string, flags=re.M)) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 0或多次" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(0, 1), match='a'>\n", "<_sre.SRE_Match object; span=(0, 6), match='abbbbb'>\n" ] } ], "source": [ "# * : occur 0 or more times\n", "print(re.search(r\"ab*\", \"a\")) \n", "print(re.search(r\"ab*\", \"abbbbb\")) \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1或多次" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "None\n", "<_sre.SRE_Match object; span=(0, 6), match='abbbbb'>\n" ] } ], "source": [ "# + : occur 1 or more times\n", "print(re.search(r\"ab+\", \"a\")) \n", "print(re.search(r\"ab+\", \"abbbbb\")) \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 可选次数" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "None\n", "<_sre.SRE_Match object; span=(0, 6), match='abbbbb'>\n" ] } ], "source": [ "# {n,m} : occur n to m times\n", "print(re.search(r\"ab{2,10}\", \"a\")) \n", "print(re.search(r\"ab{2,10}\", \"abbbbb\")) \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## group 组" ] }, { "cell_type": "code", "execution_count": 18, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "021523, Date: Feb/12/2017\n", "021523\n", "Feb/12/2017\n" ] } ], "source": [ "# group\n", "match = re.search(r\"(\\d+), Date: (.+)\", \"ID: 021523, Date: Feb/12/2017\")\n", "print(match.group()) \n", "print(match.group(1)) \n", "print(match.group(2)) " ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "021523\n", "Feb/12/2017\n" ] } ], "source": [ "match = re.search(r\"(?P<id>\\d+), Date: (?P<date>.+)\", \"ID: 021523, Date: Feb/12/2017\")\n", "print(match.group('id')) \n", "print(match.group('date')) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 寻找所有匹配 " ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['run', 'ran']\n" ] } ], "source": [ "# findall\n", "print(re.findall(r\"r[ua]n\", \"run ran ren\")) " ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['run', 'ran']\n" ] } ], "source": [ "# | : or\n", "print(re.findall(r\"(run|ran)\", \"run ran ren\")) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 替换" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dog catches to cat\n" ] } ], "source": [ "# re.sub() replace\n", "print(re.sub(r\"r[au]ns\", \"catches\", \"dog runs to cat\")) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 分裂" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['a', 'b', 'c', 'd', 'e']\n" ] } ], "source": [ "# re.split()\n", "print(re.split(r\"[,;\\.]\", \"a;b,c.d;e\")) \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## compile" ] }, { "cell_type": "code", "execution_count": 24, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(4, 7), match='ran'>\n" ] } ], "source": [ "# compile\n", "compiled_re = re.compile(r\"r[ua]n\")\n", "print(compiled_re.search(\"dog ran to cat\")) " ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1" } }, "nbformat": 4, "nbformat_minor": 1 }