服务器之家

服务器之家 > 正文

Scrapy项目实战之爬取某社区用户详情

时间:2020-09-17 13:47     来源/作者:hankleo

本文介绍了Scrapy项目实战之爬取某社区用户详情,分享给大家,具有如下:

get_cookies.py

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from selenium import webdriver
from pymongo import MongoClient
from scrapy.crawler import overridden_settings
# from segmentfault import settings
import time
import settings
 
class GetCookies(object):
 def __init__(self):
  # 初始化组件
  # 设定webdriver选项
  self.opt = webdriver.ChromeOptions()
  # self.opt.add_argument("--headless")
  # 初始化用户列表
  self.user_list = settings.USER_LIST
  # 初始化MongoDB参数
  self.client = MongoClient(settings.MONGO_URI)
  self.db = self.client[settings.MONGO_DB]
  self.collection = self.db["cookies"]
 
 def get_cookies(self,username,password):
  """
 
  :param username:
  :param password:
  :return: cookies
  """
  # 使用webdriver选项创建driver
  driver = webdriver.Chrome(executable_path="/Users/Hank/scrapy/segmentfault/segmentfault/chromedriver",options=self.opt)
  driver.get("https://segmentfault.com/user/login")
  driver.find_element_by_name("username").send_keys(username)
  driver.find_element_by_name("password").send_keys(password)
  driver.find_element_by_xpath("//button[@type='submit']").click()
  time.sleep(2)
  driver.get("https://segmentfault.com/u/luwangmeilun/users/following")
  # 登陆之后获取页面cookies
  cookies = driver.get_cookies()
  driver.quit()
 
  return cookies
 
 def format_cookies(self,cookies):
  """
 
  :param cookies:
  从driver.get_cookies的形式为:
  [{'domain': 'segmentfault.com', 'httpOnly': False, 'name': 'PHPSESSID',
  'path': '/', 'secure': False, 'value': 'web2~5grmfa89j12eksub8hja3bvaq4'},
  {'domain': '.segmentfault.com', 'expiry': 1581602940, 'httpOnly': False,
  'name': 'Hm_lvt_e23800c454aa573c0ccb16b52665ac26', 'path': '/', 'secure': False,
  'value': '1550066940'},
  {'domain': '.segmentfault.com', 'httpOnly': False,
  'name': 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26',
  'path': '/', 'secure': False, 'value': '1550066940'},
  {'domain': '.segmentfault.com', 'expiry': 1550067000, 'httpOnly': False,
  'name': '_gat', 'path': '/', 'secure': False, 'value': '1'},
  {'domain': '.segmentfault.com', 'expiry': 1550153340, 'httpOnly': False,
  'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.783265084.1550066940'},
  {'domain': '.segmentfault.com', 'expiry': 1613138940, 'httpOnly': False, 'name': '_ga',
  'path': '/', 'secure': False, 'value': 'GA1.2.1119166665.1550066940'}]
  只需提取每一项的name与value即可
 
  :return:
  """
  c = dict()
  for item in cookies:
   c[item['name']] = item['value']
 
  return c
 
 def save(self):
  print("开始获取Cookies....")
  # 从用户列表中获取用户名与密码,分别登陆获取cookies
  for username,password in self.user_list:
   cookies = self.get_cookies(username,password)
   f_cookies = self.format_cookies(cookies)
   print("insert cookie:{}".format(f_cookies))
   # 将格式整理后的cookies插入MongoDB数据库
   self.collection.insert_one(f_cookies)
 
  # s = db[self.collection].find()
  # for i in s:
  #  print(i)
 
 
if __name__ == '__main__':
 
 cookies = GetCookies()
 for i in range(20):
  cookies.save()

item.py

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# -*- coding: utf-8 -*-
 
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
 
import scrapy
 
 
class SegmentfaultItem(scrapy.Item):
 # define the fields for your item here like:
 # 个人属性
 # 姓名
 name = scrapy.Field()
 # 声望
 rank = scrapy.Field()
 # 学校
 school = scrapy.Field()
 # 专业
 majors = scrapy.Field()
 # 公司
 company = scrapy.Field()
 # 工作
 job = scrapy.Field()
 # blog
 blog = scrapy.Field()
 # 社交活动数据
 # 关注人数
 following = scrapy.Field()
 # 粉丝数
 fans = scrapy.Field()
 # 回答数
 answers = scrapy.Field()
 # 提问数
 questions = scrapy.Field()
 # 文章数
 articles = scrapy.Field()
 # 讲座数
 lives = scrapy.Field()
 # 徽章数
 badges = scrapy.Field()
 # 技能属性
 # 点赞数
 like = scrapy.Field()
 # 技能
 skills = scrapy.Field()
 # 注册日期
 register_date = scrapy.Field()
 # 问答统计
 # 回答最高得票数
 answers_top_score = scrapy.Field()
 # 得票数最高的回答对应的问题的标题
 answers_top_title = scrapy.Field()
 # 得票数最高的回答对应的问题的标签
 answers_top_tags = scrapy.Field()
 # 得票数最高的回答对应的问题的内容
 answers_top_question = scrapy.Field()
 # 得票数最高的回答对应的问题的内容
 answers_top_content = scrapy.Field()

pipeline.py

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# -*- coding: utf-8 -*-
 
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
 
class SegmentfaultPipeline(object):
 # 设定MongoDB集合名称
 collection_name = 'userinfo'
 
 def __init__(self,mongo_uri,mongo_db):
  self.mongo_uri = mongo_uri
  self.mongo_db = mongo_db
 
 # 通过crawler获取settings.py中设定的MongoDB连接信息
 @classmethod
 def from_crawler(cls,crawler):
  return cls(
   mongo_uri = crawler.settings.get('MONGO_URI'),
   mongo_db = crawler.settings.get('MONGO_DB','segmentfault')
  )
 
 # 当爬虫启动时连接MongoDB
 def open_spider(self,spider):
  self.client = pymongo.MongoClient(self.mongo_uri)
  self.db = self.client[self.mongo_db]
 
 # 当爬虫关闭时断开MongoDB连接
 def close_spider(self,spider):
  self.client.close()
 
 # 将Item插入数据库保存
 def process_item(self, item, spider):
  self.db[self.collection_name].insert_one(dict(item))
  return item

settings.py

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# -*- coding: utf-8 -*-
 
# Scrapy settings for segmentfault project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#  https://doc.scrapy.org/en/latest/topics/settings.html
#  https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#  https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 
BOT_NAME = 'segmentfault'
 
SPIDER_MODULES = ['segmentfault.spiders']
NEWSPIDER_MODULE = 'segmentfault.spiders'
 
 
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
 
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
 
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 100
 
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 32
# CONCURRENT_REQUESTS_PER_IP = 32
 
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
 
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
 
RETRY_ENABLED = False
 
REDIRECT_ENABLED = False
 
DOWNLOAD_TIMEOUT = 5
 
# HTTPALLOW
 
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
 
 
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
 'segmentfault.middlewares.SegmentfaultSpiderMiddleware': 543,
}
 
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
 # 'segmentfault.middlewares.SegmentfaultHttpProxyMiddleware': 543,
 'segmentfault.middlewares.SegmentfaultUserAgentMiddleware':643,
 'segmentfault.middlewares.SegmentfaultCookiesMiddleware':743,
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
 # 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware':None,
 
}
 
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
 
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
 'segmentfault.pipelines.SegmentfaultPipeline': 300,
}
 
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# # The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# # The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# # The average number of requests Scrapy should be sending in parallel to
# # each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# # Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
 
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 
# 配置MONGODB
MONGO_URI = 'localhost:27017'
MONGO_DB = 'segmentfault'
 
# 用户列表
USER_LIST = [
 ("798549150@qq.com","guoqing1010"),
 ("learnscrapy@163.com","guoqing1010"),
]
 
# 配置代理列表
PROXY_LIST = [
 'http://115.182.212.169:8080',
 'http://121.61.25.149:9999',
 'http://180.118.247.189:9000',
 'http://115.151.3.12:9999',
 'http://183.154.213.160:9000',
 'http://113.128.9.106:9999',
 'http://124.42.68.152:90',
 'http://49.70.48.50:9999',
 'http://113.128.11.172:9999',
 'http://111.177.177.40:9999',
 'http://59.62.83.253:9999',
 'http://39.107.84.185:8123',
 'http://124.94.195.107:9999',
 'http://111.177.160.132:9999',
 'http://120.25.203.182:7777'
]
 
USER_AGENT_LIST = [
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
 'Opera/8.0 (Windows NT 5.1; U; en)',
 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
 'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
]

userinfo.py

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# -*- coding: utf-8 -*-
import scrapy
import time
from scrapy import Request
from pymongo import MongoClient
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider,Rule
from scrapy.http import FormRequest
from segmentfault.items import SegmentfaultItem
 
 
class UserinfoSpider(CrawlSpider):
 name = 'userinfo'
 allowed_domains = ['segmentfault.com']
 start_urls = ['https://segmentfault.com/u/mybigbigcat/users/following']
 
 rules = (
  # 用户主页地址,跟进并进行解析
  Rule(LinkExtractor(allow=r'/u/\w+$'),callback='parse_item',follow=True),
  # 用户关注列表,跟进列表页面,抓取用户主页地址进行后续操作
  # Rule(LinkExtractor(allow=r'/users/followed$'),follow=True),
  # 用户粉丝列表,跟进列表页面,抓取用户主页地址进行后续操作
  Rule(LinkExtractor(allow=r'/users/following$'),follow=True),
  # 跟进其他页面地址
  # Rule(LinkExtractor(allow=r'/users/[followed|following]?page=\d+'),follow=True),
 )
 
 def start_requests(self):
  # 从MongoDB中获取一条cookie,添加到开始方法
  client = MongoClient(self.crawler.settings['MONGO_URI'])
  db = client[self.crawler.settings['MONGO_DB']]
  cookies_collection = db.cookies
  # 获取一条cookie
  cookies = cookies_collection.find_one()
  # cookie中的'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'参数是当前时间的10位表示法,因此重新填充
  cookies['Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'] = str(int(time.time()))
 
  return [Request("https://segmentfault.com",
      cookies=cookies,
      meta={'cookiejar':1},
      callback=self.after_login)]
 
 # 登录之后从start_url中开始抓取数据
 def after_login(self,response):
  for url in self.start_urls:
   return self.make_requests_from_url(url)
 # def after_login(self,response):
 #  yield Request(self.start_urls[0],
 #     meta={'cookiejar':response.meta['cookiejar']},
 #     callback=self.parse_item)
 
 def parse_item(self, response):
  """
  :param response:
  :return:
  """
  item = SegmentfaultItem()
  # 个人属性模块
  profile_head = response.css('.profile__heading')
  # 姓名
  item['name'] = profile_head.css('h2[class*=name]::text').re_first(r'\w+')
  # 声望
  item['rank'] = profile_head.css('.profile__rank-btn > span::text').extract_first()
  # 学校专业信息
  school_info = profile_head.css('.profile__school::text').extract()
  if school_info:
   # 学校
   item['school'] = school_info[0]
   # 专业
   item['majors'] = school_info[1].strip()
  else:
   item['school'] = ''
   item['majors'] = ''
  # 公司职位信息
  company_info = profile_head.css('.profile__company::text').extract()
  if company_info:
   # 公司
   item['company'] = company_info[0]
   # 职位
   item['job'] = company_info[1].strip()
  else:
   item['company'] = ''
   item['job'] = ''
  # 个人博客
  item['blog'] = profile_head.css('a[class*=other-item-link]::attr(href)').extract_first()
 
  # 统计面板模块
  profile_active = response.xpath("//div[@class='col-md-2']")
  # 关注人数
  item['following'] = profile_active.css('div[class*=info] a > .h5::text').re(r'\d+')[0]
  # 粉丝人数
  item['fans'] = profile_active.css('div[class*=info] a > .h5::text').re(r'\d+')[1]
  # 回答问题数
  item['answers'] = profile_active.css('a[href*=answer] .count::text').re_first(r'\d+')
  # 提问数
  item['questions'] = profile_active.css('a[href*=questions] .count::text').re_first(r'\d+')
  # 文章数
  item['articles'] = profile_active.css('a[href*=articles] .count::text').re_first(r'\d+')
  # 讲座数
  item['lives'] = profile_active.css('a[href*=lives] .count::text').re_first(r'\d+')
  # 徽章数
  item['badges'] = profile_active.css('a[href*=badges] .count::text').re_first(r'\d+')
  # 徽章详细页面地址
  badge_url = profile_active.css('a[href*=badges]::attr(href)').extract_first()
 
  # 技能面板模块
  profile_skill = response.xpath("//div[@class='col-md-3']")
  # 技能标签列表
  item['skills'] = profile_skill.css('.tag::text').re(r'\w+')
  # 获得的点赞数
  item['like'] = profile_skill.css('.authlist').re_first(r'获得 (\d+) 次点赞')
  # 注册日期
  item['register_date'] = profile_skill.css('.profile__skill--other p::text').extract_first()
  # if register_time:
  #  item['register_date'] = ''.join(re.findall(r'\d+',register_time))
  # else:
  #  item['register_date'] = ''
 
  # 产出数据模块
  profile_work = response.xpath("//div[@class='col-md-7']")
  # 回答获得的最高分
  item['answers_top_score'] = profile_work.css('#navAnswer .label::text').re_first(r'\d+')
  # 最高分回答对应的问题的标题
  item['answers_top_title'] = profile_work.css('#navAnswer div[class*=title-warp] > a::text').extract_first()
  # 最高分回答对应的问题的url
  answer_url = profile_work.css('#navAnswer div[class*=title-warp] > a::attr(href)').extract_first()
 
  # 将需要继续跟进抓取数据的url与item作为参数传递给相应方法继续抓取数据
  request = scrapy.Request(
   # 问题详细页url
   url=response.urljoin(answer_url),
   meta={
   # item需要传递
   'item':item,
   # 徽章的url
   'badge_url':response.urljoin(badge_url)},
   # 调用parse_ansser继续处理
   callback=self.parse_answer)
  yield request
 
 def parse_answer(self,response):
  # 取出传递的item
  item = response.meta['item']
  # 取出传递的徽章详细页url
  badge_url = response.meta['badge_url']
  # 问题标签列表
  item['answers_top_tags'] = response.css('.question__title--tag .tag::text').re(r'\w+')
  # 先获取组成问题内容的字符串列表
  question_content = response.css('.widget-question__item p').re(r'>(.*?)<')
  # 拼接后传入item
  item['answers_top_question'] = ''.join(question_content)
  # 先获取组成答案的字符串列表
  answer_content = response.css('.qa-answer > article .answer').re(r'>(.*?)<')
  # 拼接后传入item
  item['answers_top_content'] = ''.join(answer_content)
 
  # 问题页面内容抓取后继续抓取徽章页内容,并将更新后的item继续传递
  request = scrapy.Request(url=badge_url,
         meta={'item':item},
         callback=self.parse_badge)
  yield request
 
 def parse_badge(self,response):
  item = response.meta['item']
  badge_name = response.css('span.badge span::text').extract()
  badge_count = response.css('span[class*=badges-count]::text').re(r'\d+')
  name_count = {}
  for i in range(len(badge_count)):
   name_count[badge_name[i]] = badge_count[i]
  item['badges'] = name_count
  yield item

middlewars.py

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# -*- coding: utf-8 -*-
 
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
import re
import datetime
import scrapy
import logging
import time
from scrapy.conf import settings
from pymongo import MongoClient
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
import pymongo
logger = logging.getLogger(__name__)
 
 
class SegmentfaultSpiderMiddleware(object):
 """
 处理Item中保存的三种类型注册日期数据:
 1. 注册于 2015年12月12日
 2. 注册于 3 天前
 3. 注册于 5 小时前
 """
 
 def process_spider_output(self,response,result,spider):
 
  """
  输出response时调用此方法处理item中register_date
  :param response:
  :param result: 包含item
  :param spider:
  :return:处理过注册日期的item
  """
  for item in result:
   # 判断获取的数据是否是scrapy.item类型
   if isinstance(item,scrapy.Item):
    # 获取当前时间
    now = datetime.datetime.now()
    register_date = item['register_date']
    logger.info("获取注册日志格式为{}".format(register_date))
    # 提取注册日期字符串,如'注册于2015年12月12日' => '20151212'
    day = ''.join(re.findall(r'\d+',register_date))
    # 如果提取数字字符串长度大于4位,则为'注册于2015年12月12日'形式
    if len(day) > 4:
     date = day
    # 如果‘时'在提取的字符串中,则为'注册于8小时前'形式
    elif '时' in register_date:
     d = now - datetime.timedelta(hours=int(day))
     date = d.strftime("%Y%m%d")
    # 最后一种情况就是'注册于3天前'形式
    else:
     d = now - datetime.timedelta(days=int(day))
     date = d.strftime("%Y%m%d")
 
    # 更新register_date值
    item['register_date'] = date
   yield item
 
 
class SegmentfaultHttpProxyMiddleware(object):
 # Not all methods need to be defined. If a method is not defined,
 # scrapy acts as if the downloader middleware does not modify the
 # passed objects.
 def __init__(self):
  self.proxy_list = settings['PROXY_LIST']
 
 def process_request(self, request, spider):
  proxy = random.choice(self.proxy_list)
  logger.info('使用代理:{}'.format(proxy))
  request.meta['proxy'] = proxy
 
 
class SegmentfaultUserAgentMiddleware(object):
 def __init__(self):
  self.useragent_list = settings['USER_AGENT_LIST']
 
 def process_request(self,request,spider):
  user_agent = random.choice(self.useragent_list)
 
  # logger.info('使用的USE USER-AGENT:{}'.format(user_agent))
  request.headers['User-Agent'] = user_agent
 
 
 
class SegmentfaultCookiesMiddleware(object):
 client = MongoClient(settings['MONGO_URI'])
 db = client[settings['MONGO_DB']]
 collection = db['cookies']
 
 def get_cookies(self):
  """
  随机获取cookies
  :return:
  """
  cookies = random.choice([cookie for cookie in self.collection.find()])
  # 将不需要的"_id"与"_gat"参数删除
  cookies.pop('_id')
  cookies.pop('_gat')
  # 将"Hm_lpvt_e23800c454aa573c0ccb16b52665ac26"填充当前时间
  cookies['Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'] = str(int(time.time()))
  return cookies
 
 def remove_cookies(self,cookies):
  """
  删除已失效的cookies
  :param cookies:
  :return:
  """
  # 随机获取cookies中的一对键值,返回结果是一个元祖
  i = cookies.popitem()
  # 删除cookies
  try:
   logger.info("删除cookies{}".format(cookies))
   self.collection.remove({i[0]:i[1]})
  except Exception as e:
   logger.info("No this cookies:{}".format(cookies))
 
 def process_request(self,request,spider):
  """
  为每一个request添加一个cookie
  :param request:
  :param spider:
  :return:
  """
  cookies = self.get_cookies()
  request.cookies = cookies
 
 def process_response(self,request,response,spider):
  """
  对于登录失效的情况,可能会重定向到登录页面,这时添加新的cookies继续,将请求放回调度器
  :param request:
  :param response:
  :param spider:
  :return:
  """
  if response.status in [301,302]:
   logger.info("Redirect response:{}".format(response))
   redirect_url = response.headers['location']
   if b'/user/login' in redirect_url:
    logger.info("Cookies失效")
 
    # 请求失败,重新获取一个cookie,添加到request,并停止后续中间件处理此request,将此request放入调度器
    new_cookie = self.get_cookies()
    logger.info("获取新cookie:{}".format(new_cookie))
    # 删除旧cookies
    self.remove_cookies(request.cookies)
    request.cookies = new_cookie
   return request
  #
  return response

run.py

?
1
2
3
4
5
6
7
8
9
10
11
from scrapy import cmdline
# from segmentfault.get_cookies import GetCookies
from get_cookies import GetCookies
 
if __name__ == '__main__':
 cookies = GetCookies()
 cookies.save()
 name = 'userinfo'
 ""
 cmd = 'scrapy crawl {}'.format(name)
 cmdline.execute(cmd.split())

到此这篇关于Scrapy项目实战之爬取某社区用户详情的文章就介绍到这了,更多相关Scrapy 爬取某社区用户内容请搜索服务器之家以前的文章或继续浏览下面的相关文章希望大家以后多多支持服务器之家!

原文链接:https://www.cnblogs.com/hankleo/p/12994207.html

标签:

相关文章

热门资讯

2020微信伤感网名听哭了 让对方看到心疼的伤感网名大全
2020微信伤感网名听哭了 让对方看到心疼的伤感网名大全 2019-12-26
Intellij idea2020永久破解,亲测可用!!!
Intellij idea2020永久破解,亲测可用!!! 2020-07-29
歪歪漫画vip账号共享2020_yy漫画免费账号密码共享
歪歪漫画vip账号共享2020_yy漫画免费账号密码共享 2020-04-07
电视剧《琉璃》全集在线观看 琉璃美人煞1-59集免费观看地址
电视剧《琉璃》全集在线观看 琉璃美人煞1-59集免费观看地址 2020-08-12
最新idea2020注册码永久激活(激活到2100年)
最新idea2020注册码永久激活(激活到2100年) 2020-07-29
返回顶部