Commit 96bedd4

first spider
first spider using scrapy
1 parent f3ba7fb commit 96bedd4

File tree

15 files changed: +318 −0 lines changed
Five binary files are not shown (128 Bytes, 449 Bytes, 1.09 KB, 295 Bytes, 6.11 MB).

first_spider/firstspider/items.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class FirstspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    positionName = scrapy.Field()
    rate = scrapy.Field()
    companyName = scrapy.Field()
    salary = scrapy.Field()
    workLocation = scrapy.Field()
    publishTime = scrapy.Field()
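FirstspiderItem declares the six fields of a scraped job posting. Scrapy items behave like dicts, which is what the pipeline further down relies on when it reads the fields by key. A minimal usage sketch; the field values here are made up for illustration, not taken from the commit:

    from firstspider.items import FirstspiderItem

    item = FirstspiderItem(positionName="Python工程师", salary="15k-25k")  # sample values only
    print(item["positionName"])   # fields are read like dict keys
    print(dict(item))             # convertible to a plain dict, e.g. for json.dumps or a csv row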
first_spider/firstspider/middlewares.py

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class FirstspiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class FirstspiderDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
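Both classes above are the unmodified Scrapy project templates; nothing in this commit customises them. As an illustration of what process_request is typically used for in a crawler like this, a downloader middleware could attach a browser-like User-Agent to every outgoing request. This is a hedged sketch, not part of the commit, and it would still need to be registered under DOWNLOADER_MIDDLEWARES in settings.py:

    # Illustrative sketch only -- this commit keeps the template middleware as generated.
    class CustomUserAgentMiddleware(object):
        def process_request(self, request, spider):
            # Set the header only if the request does not already carry one,
            # then return None so the request keeps flowing through the chain.
            request.headers.setdefault(
                b'User-Agent',
                b'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')
            return None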
first_spider/firstspider/pipelines.py

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import csv
import json


class FirstspiderPipeline(object):
    def __init__(self):
        # Open the output file once and write the header row:
        # position name, response rate, company name, salary, work location, publish time
        self.f = open("data.csv", mode="w", newline="")
        self.wr = csv.writer(self.f)
        self.wr.writerow(["职位名称", "反馈率", "公司名称", "薪资", "工作地点", "发布时间"])

    def process_item(self, item, spider):
        # data = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        # data = dict(item)
        # print(data)
        data = [item["positionName"], item["rate"], item["companyName"],
                item["salary"], item["workLocation"], item["publishTime"]]
        self.wr.writerow(data)
        return item

    def close_spider(self, spider):
        self.f.close()
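FirstspiderPipeline opens data.csv once, writes the Chinese header row, appends one row per scraped item, and closes the file when the spider finishes. Note that open() is called without an explicit encoding, so the platform default is used; if the Chinese text comes out garbled (typically when opening data.csv in Excel on Windows), a common remedy is to pass an encoding explicitly. A sketch of that variant of __init__, offered as an assumption rather than something this commit does:

    # Possible tweak, not in this commit: write the CSV as UTF-8 with BOM so that
    # Excel detects the encoding of the Chinese headers correctly.
    self.f = open("data.csv", mode="w", newline="", encoding="utf-8-sig")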
first_spider/firstspider/settings.py

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-

# Scrapy settings for firstspider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'firstspider'

SPIDER_MODULES = ['firstspider.spiders']
NEWSPIDER_MODULE = 'firstspider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'firstspider (+http://www.yourdomain.com)'

# Obey robots.txt rules
# Whether to obey the robots.txt protocol; to actually scrape the data, set this
# to False or comment it out
#ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# Spider middlewares; several can be listed, and the trailing number is the
# priority: the smaller the number, the higher the priority
#    'firstspider.middlewares.FirstspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'firstspider.middlewares.FirstspiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'firstspider.pipelines.FirstspiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
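Only ITEM_PIPELINES is actually enabled; every other setting is left at the commented-out template default, including ROBOTSTXT_OBEY, which the translated comment above suggests disabling before crawling. The overrides below are a typical follow-up for a project like this, shown as assumptions rather than as part of this commit:

    # Possible additions to settings.py -- illustrative, not in this commit.
    ROBOTSTXT_OBEY = False          # as the comment above suggests for actually fetching data
    DOWNLOAD_DELAY = 1              # throttle requests to be polite to the target site
    DEFAULT_REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',   # placeholder UA string
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    }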
first_spider/firstspider/spiders/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
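The spider module itself is one of the 15 files in this commit, but it is not reproduced in this extract. A minimal sketch of a spider that would feed FirstspiderItem into FirstspiderPipeline; the spider name, start URL, and XPath expressions are hypothetical placeholders, not values taken from the commit:

    # -*- coding: utf-8 -*-
    # Hypothetical sketch -- the real spider file from this commit is not shown on this page.
    import scrapy

    from firstspider.items import FirstspiderItem


    class JobSpider(scrapy.Spider):
        name = 'job'                                   # placeholder spider name
        start_urls = ['https://example.com/jobs']      # placeholder listing URL

        def parse(self, response):
            # One <li> per job posting is assumed; adjust the selectors to the real page.
            for row in response.xpath('//li[@class="job-item"]'):
                item = FirstspiderItem()
                item['positionName'] = row.xpath('.//h3/a/text()').extract_first()
                item['rate'] = row.xpath('.//span[@class="rate"]/text()').extract_first()
                item['companyName'] = row.xpath('.//p[@class="company"]/text()').extract_first()
                item['salary'] = row.xpath('.//span[@class="salary"]/text()').extract_first()
                item['workLocation'] = row.xpath('.//span[@class="area"]/text()').extract_first()
                item['publishTime'] = row.xpath('.//time/text()').extract_first()
                yield item

Running scrapy crawl job from the project root would then produce data.csv through the pipeline.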

0 commit comments
