From 831ae03e7333e553c3ae4ccfa127cca0a4df0163 Mon Sep 17 00:00:00 2001
From: "FIRELING-PC\\FIRELING" <lining0806@gmail.com>
Date: Sun, 22 Apr 2018 11:57:53 +0800
Subject: [PATCH 1/4] Add Robots protocol section to ReadMe

---
 ReadMe.md | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/ReadMe.md b/ReadMe.md
index 2d898a2e..c6882ff6 100644
--- a/ReadMe.md
+++ b/ReadMe.md
@@ -256,3 +256,45 @@ Scrapy是一个基于Twisted的开源的Python爬虫框架，在工业中应用
 相关内容可以参考[基于Scrapy网络爬虫的搭建](http://www.lining0806.com/%E5%9F%BA%E4%BA%8Escrapy%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB%E7%9A%84%E6%90%AD%E5%BB%BA/)，同时给出这篇文章介绍的[微信搜索](http://weixin.sogou.com/weixin)爬取的项目代码，给大家作为学习参考。
 
 参考项目：[使用Scrapy或Requests递归抓取微信搜索结果](https://github.com/lining0806/PythonSpiderNotes/blob/master/WechatSearchProjects)
+
+## Robots协议  
+
+好的网络爬虫，首先需要遵守**Robots协议**。Robots协议（也称为爬虫协议、机器人协议等）的全称是“网络爬虫排除标准”（Robots Exclusion Protocol），网站通过Robots协议告诉搜索引擎哪些页面可以抓取，哪些页面不能抓取。
+
+在网站根目录下放一个robots.txt文本文件（如 https://www.taobao.com/robots.txt ），里面可以指定不同的网络爬虫能访问的页面和禁止访问的页面，指定的页面由正则表达式表示。网络爬虫在采集这个网站之前，首先获取到这个robots.txt文本文件，然后解析到其中的规则，然后根据规则来采集网站的数据。
+
+### Robots协议规则
+
+	User-agent: 指定对哪些爬虫生效
+	Disallow: 指定不允许访问的网址
+	Allow: 指定允许访问的网址
+	注意: 一个英文要大写，冒号是英文状态下，冒号后面有一个空格，"/"代表整个网站
+
+### Robots协议举例
+
+	禁止所有机器人访问
+		User-agent: *
+		Disallow: /
+	允许所有机器人访问
+		User-agent: *
+		Disallow: 
+	禁止特定机器人访问
+		User-agent: BadBot
+		Disallow: /
+	允许特定机器人访问
+		User-agent: GoodBot
+		Disallow: 
+	禁止访问特定目录
+		User-agent: *
+		Disallow: /images/
+	仅允许访问特定目录
+		User-agent: *
+		Allow: /images/
+		Disallow: /
+	禁止访问特定文件
+		User-agent: *
+		Disallow: /*.html$
+	仅允许访问特定文件
+		User-agent: *
+		Allow: /*.html$
+		Disallow: /
\ No newline at end of file

From bbe2a98bb4164c4af68ef10609def903116c72fb Mon Sep 17 00:00:00 2001
From: "FIRELING-PC\\FIRELING" <lining0806@gmail.com>
Date: Sun, 22 Apr 2018 12:12:41 +0800
Subject: [PATCH 2/4] Move Robots rules note out of the code block in ReadMe

---
 ReadMe.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ReadMe.md b/ReadMe.md
index c6882ff6..37fca5d8 100644
--- a/ReadMe.md
+++ b/ReadMe.md
@@ -268,7 +268,8 @@ Scrapy是一个基于Twisted的开源的Python爬虫框架，在工业中应用
 	User-agent: 指定对哪些爬虫生效
 	Disallow: 指定不允许访问的网址
 	Allow: 指定允许访问的网址
-	注意: 一个英文要大写，冒号是英文状态下，冒号后面有一个空格，"/"代表整个网站
+
+注意: 一个英文要大写，冒号是英文状态下，冒号后面有一个空格，"/"代表整个网站
 
 ### Robots协议举例
 

From 78fee6c50606ab8c8305ea0dcbe8f698b4c755f5 Mon Sep 17 00:00:00 2001
From: "FIRELING-PC\\FIRELING" <lining0806@gmail.com>
Date: Sun, 22 Apr 2018 12:16:16 +0800
Subject: [PATCH 3/4] Number the Robots protocol subsection headings in ReadMe

---
 ReadMe.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ReadMe.md b/ReadMe.md
index 37fca5d8..abb10cb7 100644
--- a/ReadMe.md
+++ b/ReadMe.md
@@ -263,7 +263,7 @@ Scrapy是一个基于Twisted的开源的Python爬虫框架，在工业中应用
 
 在网站根目录下放一个robots.txt文本文件（如 https://www.taobao.com/robots.txt ），里面可以指定不同的网络爬虫能访问的页面和禁止访问的页面，指定的页面由正则表达式表示。网络爬虫在采集这个网站之前，首先获取到这个robots.txt文本文件，然后解析到其中的规则，然后根据规则来采集网站的数据。
 
-### Robots协议规则
+### 1. Robots协议规则
 
 	User-agent: 指定对哪些爬虫生效
 	Disallow: 指定不允许访问的网址
@@ -271,7 +271,7 @@ Scrapy是一个基于Twisted的开源的Python爬虫框架，在工业中应用
 
 注意: 一个英文要大写，冒号是英文状态下，冒号后面有一个空格，"/"代表整个网站
 
-### Robots协议举例
+### 2. Robots协议举例
 
 	禁止所有机器人访问
 		User-agent: *

From da645036061fcdcd43ecfd16a9980a958c023160 Mon Sep 17 00:00:00 2001
From: lining <lining@kuaishou.com>
Date: Mon, 21 Jun 2021 10:47:47 +0800
Subject: [PATCH 4/4] Use placeholder password in commented-out pipeline credentials

---
 WechatSearchProjects/Wechatproject/Wechatproject/pipelines.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/WechatSearchProjects/Wechatproject/Wechatproject/pipelines.py b/WechatSearchProjects/Wechatproject/Wechatproject/pipelines.py
index 7bf76f23..1c93d38a 100644
--- a/WechatSearchProjects/Wechatproject/Wechatproject/pipelines.py
+++ b/WechatSearchProjects/Wechatproject/Wechatproject/pipelines.py
@@ -18,7 +18,7 @@
 #                                             host = "localhost",
 #                                             db = "testwechat", # you must build database named testwechat
 #                                             user = "root",
-#                                             passwd = "fireling",
+#                                             passwd = "testpasswd",
 #                                             charset = "utf8")
 #     # pipeline default function
 #     def process_item(self, item, spider):
@@ -37,7 +37,7 @@ class WechatprojectPipeline(object):
     def __init__(self):
         connection = pymongo.Connection(host = "localhost", port = 27017)
         db = connection["testwechat"] # you need no build database named testdouban
-        # db.authenticate(name = "root", password = "fireling") # no name and password for localhost
+        # db.authenticate(name = "root", password = "testpasswd") # no name and password for localhost
         self.posts = db["result"] # you need not build collection named book
     # pipeline default function
     def process_item(self, item, spider):