Table of Contents
1. Introduction
2. Code
1. main.py
2. PageSpider.py
3. DetailSpider.py
4. DataParse.py
5. Constant.py
6. HanderRequest.py
1. Introduction
1. Crawl the target website with multiple threads (a sketch of the queue/worker pattern follows this list).
2. Save the scraped data to an Excel file.
3. Site crawled (for testing only), a listing of online money-making project posts: https://www.maomp.***/
4. Result.
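Before the file-by-file listing, here is a minimal sketch of the producer/consumer pattern the whole project relies on: a queue.Queue is pre-filled with URLs, and several threading.Thread workers drain it until it is empty. The Worker class, the fetch_url helper, and the example URLs below are placeholders for illustration only, not project code.

# coding:utf-8
# Minimal sketch of the queue/worker pattern used throughout this project.
import threading
from queue import Queue, Empty

def fetch_url(url):
    # Placeholder for an HTTP request; the real project uses HanderRequest.send_reqeust()
    print("fetching", url)

class Worker(threading.Thread):
    def __init__(self, name, task_queue):
        super().__init__(name=name)
        self.task_queue = task_queue

    def run(self):
        # Drain the queue until it is empty; get(block=False) raises Empty when nothing is left
        while True:
            try:
                url = self.task_queue.get(block=False)
            except Empty:
                break
            fetch_url(url)

if __name__ == "__main__":
    task_queue = Queue()
    for i in range(1, 4):
        task_queue.put("https://example.com/page/{}".format(i))  # placeholder URLs
    workers = [Worker("worker-{}".format(i), task_queue) for i in range(3)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()

The real code below follows exactly this shape, just with three queues chained together: listing pages feed detail URLs, detail pages feed raw HTML, and the parsing threads write the final rows to Excel.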
2. Code
1. main.py
# coding:utf-8
import threading
from queue import Queue
from PageSpider import PageSpider
from DetailSpider import DetailSpider
from DataParse import DataParse
import xlsxwriter
import time

"""
Crawl https://www.maomp.***/wzjc/
Scrape the article data and save it to an Excel file.
"""

def start_page(threadsize, page_queue, detail_queue):
    # Start the threads that collect the listing pages
    page_spider_list = []
    for i in range(1, threadsize + 1):
        page_spider_thread = PageSpider(thread_name="page-spider-" + str(i),
                                        page_queue=page_queue,
                                        detail_queue=detail_queue)
        # Start the thread
        page_spider_thread.start()
        page_spider_list.append(page_spider_thread)
    # Wait until every listing-page thread has drained page_queue and exited
    for page_spider in page_spider_list:
        if page_spider.is_alive():
            page_spider.join()

def start_detail(threadsize, detail_queue, data_queue):
    # Start the threads that fetch the detail pages
    detail_spider_list = []
    for i in range(1, threadsize + 1):
        detail_spider_thread = DetailSpider(thread_name="detail-spider-" + str(i),
                                            detail_queue=detail_queue,
                                            data_queue=data_queue)
        # Start the thread
        detail_spider_thread.start()
        detail_spider_list.append(detail_spider_thread)
    # Wait until every detail-page thread has drained detail_queue and exited
    for detail_spider in detail_spider_list:
        if detail_spider.is_alive():
            detail_spider.join()

def start_data_parse(threadsize, data_queue, book):
    # Start the threads that parse the detail pages and write them to Excel
    lock = threading.Lock()
    sheet1 = book.add_worksheet("sheet1")
    title_data = ("URL", "Title", "Publish date", "Content")
    # Write the header row
    for index, title_datum in enumerate(title_data):
        sheet1.write(0, index, title_datum)
    spider_list = []
    for i in range(1, threadsize + 1):
        thread = DataParse(thread_name="data-parse-" + str(i), data_queue=data_queue, lock=lock, sheet=sheet1)
        # Start the thread
        thread.start()
        spider_list.append(thread)
    # Wait until every parsing thread has drained data_queue and exited
    for parse in spider_list:
        if parse.is_alive():
            parse.join()

def main():
    # Queue of listing-page URLs
    page_queue = Queue()
    # Queue of detail-page URLs
    detail_queue = Queue()
    # Queue of detail-page HTML waiting to be parsed
    data_queue = Queue()
    page_start = 1
    page_end = 1
    for i in range(page_start, page_end + 1):
        page_url = "https://www.maomp.***/wzjc/page/{}/".format(i)
        page_queue.put(page_url)
    print("Listing-page queue size:", page_queue.qsize())
    # Collect the listing pages
    start_page(threadsize=3, page_queue=page_queue, detail_queue=detail_queue)
    # Collect the detail pages
    start_detail(threadsize=3, detail_queue=detail_queue, data_queue=data_queue)
    # Parse the data and write it to Excel
    # Create the workbook the parsed rows are written into
    book = xlsxwriter.Workbook(time.strftime("%Y%m%d%H%M%S", time.gmtime()) + "file.xlsx")
    start_data_parse(threadsize=5, data_queue=data_queue, book=book)
    book.close()
    print("Remaining listing-page URLs:", page_queue.qsize())
    print("Remaining detail-page URLs:", detail_queue.qsize())
    print("Remaining unparsed items:", data_queue.qsize())

if __name__ == '__main__':
    main()
2. PageSpider.py
# coding:utf-8
import threading
from lxml import etree
import HanderRequest

class PageSpider(threading.Thread):
    """
    Worker thread that requests the listing-page URLs
    """
    def __init__(self, thread_name, page_queue, detail_queue):
        super(PageSpider, self).__init__()
        self.thread_name = thread_name
        self.page_queue = page_queue
        self.detail_queue = detail_queue

    def parse_detail_url(self, content):
        """
        Parse a listing page and extract the detail-page URLs
        :param content: listing-page HTML text
        :return: None (the detail URLs are pushed onto detail_queue)
        """
        # Build an lxml document from the listing-page HTML
        item_html = etree.HTML(content)
        # Extract the detail-page URLs
        detail_urls = item_html.xpath("//h2[@class='entry-title']/a/@href")
        for url in detail_urls:
            # Push each detail-page URL onto the queue
            self.detail_queue.put(url)

    def run(self):
        # Fetch listing pages until page_queue is empty
        print("{} started".format(self.thread_name))
        try:
            while not self.page_queue.empty():
                # Non-blocking get; raises queue.Empty if another thread took the last item
                page_url = self.page_queue.get(block=False)
                # Request the listing page
                response_text = HanderRequest.send_reqeust(page_url)
                if response_text:
                    # Parse out the detail-page URLs
                    self.parse_detail_url(response_text)
        except Exception as e:
            print("{} error: {}".format(self.thread_name, e))
        print("{} finished".format(self.thread_name))
3. DetailSpider.py
# coding:utf-8
import threading
import HanderRequest

class DetailSpider(threading.Thread):
    """
    Worker thread that requests the detail pages
    """
    def __init__(self, thread_name, detail_queue, data_queue):
        super(DetailSpider, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.detail_queue = detail_queue

    def run(self):
        # Fetch detail pages until detail_queue is empty
        print("{} started".format(self.thread_name))
        try:
            while not self.detail_queue.empty():
                # Non-blocking get; raises queue.Empty if another thread took the last item
                detail_url = self.detail_queue.get(block=False)
                # Request the detail page
                response_text = HanderRequest.send_reqeust(detail_url)
                if response_text:
                    data = {
                        "url": detail_url,
                        "html_content": response_text
                    }
                    # Push the raw page onto data_queue for the parsing threads
                    self.data_queue.put(data)
        except Exception as e:
            print("{} error: {}".format(self.thread_name, e))
        print("{} finished".format(self.thread_name))
4. DataParse.py
# coding:utf-8
import threading
from lxml import etree
import Constant

class DataParse(threading.Thread):
    """
    Worker thread that parses the detail pages and writes the rows to Excel
    """
    def __init__(self, thread_name, data_queue, lock, sheet):
        super(DataParse, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock
        self.sheet = sheet

    def __list_join(self, items):
        return "".join(items)

    def __parse(self, data):
        """
        Parse one item from data_queue
        and save it to the Excel sheet
        :return: None
        """
        html = etree.HTML(data.get("html_content"))
        data = {
            "url": data.get("url"),
            "title": self.__list_join(html.xpath("//h1[@class='entry-title']/text()")),
            "put_date": self.__list_join(html.xpath("//span[@class='my-date']/text()")),
            "content_html": self.__list_join(html.xpath("//div[@class='single-content']//p/text()"))
        }
        # Several threads share the sheet and the row counter, so guard the write with the lock
        with self.lock:
            # Write one row to Excel; Constant.CURR_EXCEL_COL tracks the next free row
            for index, e in enumerate(data):
                self.sheet.write(Constant.CURR_EXCEL_COL, index, data.get(e))
            Constant.CURR_EXCEL_COL += 1

    def run(self):
        # Parse pages until data_queue is empty
        print("{} started".format(self.thread_name))
        try:
            while not self.data_queue.empty():
                # Non-blocking get; raises queue.Empty if another thread took the last item
                data_content = self.data_queue.get(block=False)
                # Parse the HTML and write the row
                self.__parse(data_content)
        except Exception as e:
            print("{} error: {}".format(self.thread_name, e))
        print("{} finished".format(self.thread_name))
5. Constant.py
# coding:utf-8
# Index of the next Excel row to write (shared by all DataParse threads)
CURR_EXCEL_COL = 1
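This counter is shared by every DataParse thread, which is why DataParse only increments it inside `with self.lock`. A stand-alone sketch of the same guarded read-and-increment pattern (illustrative only, not part of the project):

# coding:utf-8
# Why the lock matters: without it, two threads can read the same row index and overwrite each other's row.
import threading

counter = 0
lock = threading.Lock()

def claim_rows(n):
    global counter
    for _ in range(n):
        with lock:
            row = counter      # read the next free row
            counter = row + 1  # advance the shared cursor atomically

threads = [threading.Thread(target=claim_rows, args=(1000,)) for _ in range(5)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(counter)  # always 5000 with the lock; may come up short without it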
6. HanderRequest.py
Note: replace the Cookie value with your own.
# coding:utf-8
import requests

def send_reqeust(url):
    # Send the GET request; the Cookie header must be replaced with a valid one
    headers = {
        "Cookie": "xxx",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    # Only return the body for a successful response; callers treat None as a failed request
    if response.status_code == 200:
        return response.text
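Since every worker thread goes through this helper, it can save time to verify the Cookie before launching the full crawl. A minimal check (illustrative only; the listing URL is the one used in main.py, with the domain masked as elsewhere in this article):

# coding:utf-8
# Quick pre-flight check that the Cookie in HanderRequest.py works.
import HanderRequest

if __name__ == "__main__":
    test_url = "https://www.maomp.***/wzjc/page/1/"  # listing URL from main.py
    html = HanderRequest.send_reqeust(test_url)
    if html:
        print("Cookie looks valid, received {} characters".format(len(html)))
    else:
        # send_reqeust returns None for any non-200 response
        print("Request failed; update the Cookie in HanderRequest.py")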