1500字范文 > python 爬虫小说使用无头浏览器 + 自动化爬虫

python 爬虫小说使用无头浏览器 + 自动化爬虫

时间：2023-07-16 05:37:00

仅供学习，请勿用于商业用途！未经允许，请勿转载。

首先找到搜索接口，并确认它的请求方法和请求参数；当前接口使用 POST 方法。

请求参数为：searchtype（固定为 articlename）和 searchkey（要搜索的小说名称）。

获取对应小说的详情介绍页

用到的类以及浏览器驱动的获取方法，请参考下面这篇文章：

python selenium4 使用无界面浏览器爬虫并存储 mysql 数据库_fuchto的博客-CSDN博客_python 无界面浏览器：浏览器驱动需要查看对应浏览器版本进行下载（selenium · PyPI：https://pypi.org/project/selenium/ ），可在浏览器设置中查看当前版本。from selenium import webdriver / from selenium.webdriver.chrome.service import Service / from selenium.webdriver.common.by import By / # select 选择框需要引入 select 类 / fro... https://blog.csdn.net/fuchto/article/details/124480885

废话不多说直接上代码

"""Download every chapter of the novels that match a search term.

The search endpoint is queried with ``requests``. Each chapter page is then
loaded in headless Chrome through Selenium, and its text is saved as a UTF-8
file inside a per-novel directory located next to this script.
"""

import json
import math
import os
import re
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
# The Select class is what drives <select> drop-downs.
from selenium.webdriver.support.select import Select

# Placeholder kept from the published listing (it reads "message me privately
# for the request URL"); replace it with the real search endpoint before running.
SEARCH_URL = "私聊获取请求地址"

# Raw string: the backslashes are path separators, not escape sequences.
CHROMEDRIVER_PATH = r"D:\pythonVendor\chrome\chromedriver.exe"

# Seconds to wait for an HTTP response before giving up.
REQUEST_TIMEOUT = 30

# Seconds to pause after each chapter so the site is not hammered.
CHAPTER_DELAY = 30

# Patterns are compiled once here instead of on every loop iteration.
_NOVEL_LINK_RE = re.compile('http.+html')
_NOVEL_NAME_RE = re.compile(r'alt="(\w+)"')
_HREF_RE = re.compile(r'href=\"(.+?)\"')
_TITLE_RE = re.compile(r"<h1>(.+?)</h1>")
# NOTE(review): presumably an injected boilerplate paragraph - confirm on the site.
_DATA_ID_99_RE = re.compile(r'<p data-id="99" .+?>(.+?)</p>', re.S)
_TAG_RE = re.compile(r'<[^>]+>', re.S)

# Characters that cannot appear in a Windows file name, plus the full-width
# colon the original listing already replaced; each one becomes a space.
_UNSAFE_NAME_CHARS = str.maketrans({char: ' ' for char in '\\/:*?"<>|：'})


def bubbleSort(arr, key=None):
    """Sort ``arr`` in place in ascending order and return it.

    The name is kept for backward compatibility; the work is delegated to the
    built-in stable sort, which orders elements the same way.

    Args:
        arr: List to sort; it is modified in place.
        key: Optional one-argument function that extracts a comparison key,
            exactly as for ``list.sort``.

    Returns:
        The same list object, now sorted.
    """
    arr.sort(key=key)
    return arr


def str_replaces(arr):
    """Return a new list in which ".html" is replaced by a space in each item.

    Args:
        arr: Iterable of strings (chapter hrefs in this script).

    Returns:
        A list of the converted strings; ``arr`` itself is not modified.
    """
    return [item.replace(".html", ' ') for item in arr]


def _chapter_sort_key(chapter_id):
    """Order numeric chapter ids by value, and all other ids after them."""
    text = chapter_id.strip()
    if text.isdecimal():
        return (0, int(text), "")
    return (1, 0, text)


def _create_driver():
    """Start a headless Chrome session using the configured chromedriver."""
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    return webdriver.Chrome(
        service=Service(CHROMEDRIVER_PATH), options=chrome_options
    )


def _fetch_soup(url, encoding=None):
    """GET ``url`` and parse the body; ``None`` means use the detected encoding."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if encoding is None:
        encoding = response.apparent_encoding
    response.encoding = encoding
    return BeautifulSoup(response.text, "html.parser")


def _save_chapter(driver, chapter_url, novel_dir):
    """Load one chapter in the browser and write its text into ``novel_dir``."""
    driver.get(chapter_url)
    chapter_soup = BeautifulSoup(driver.page_source, 'html.parser')
    title_html = chapter_soup.select(
        "body > div.readerListBody > div.readerTitle"
    )
    titles = _TITLE_RE.findall(str(title_html))
    bodies = chapter_soup.select("#content")
    if not titles or not bodies:
        # The page did not have the expected layout; skip it instead of
        # crashing the whole run on an IndexError.
        print("\n无法解析章节页面，已跳过：" + chapter_url)
        return
    title = str(titles[0])
    print("\n章节名称：" + title)

    content = _DATA_ID_99_RE.sub('', str(bodies[0]))
    # Every remaining tag becomes a line break.
    content = _TAG_RE.sub('\r\n', content)
    content = content.replace('最新网址：', '')
    # NOTE(review): the published listing also replaced '' with '' here, which
    # is a no-op - presumably the search text was lost; confirm with the author.

    file_name = title.translate(_UNSAFE_NAME_CHARS) + ".txt"
    chapter_path = os.path.join(novel_dir, file_name)
    # newline="" writes the "\r\n" separators as they are; default text mode
    # would expand them to "\r\r\n" on Windows.
    with open(chapter_path, "w", encoding="utf-8", newline="") as chapter:
        chapter.write(content)
    print("\n章节：" + title + "保存成功")


def _download_novel(driver, anchor_html, basedir):
    """Download all chapters of the novel described by one search-result link."""
    links = _NOVEL_LINK_RE.findall(anchor_html)
    names = _NOVEL_NAME_RE.findall(anchor_html)
    if not links or not names:
        print("\n无法解析搜索结果，已跳过")
        return
    detail_url, novel_name = links[0], names[0]
    print("\n小说名称：" + novel_name)

    # Create the folder at the same path that is checked and written to,
    # not relative to whatever the current working directory happens to be.
    novel_dir = os.path.join(basedir, novel_name)
    if not os.path.exists(novel_dir):
        print("\n正在创建目录" + novel_dir)
        os.makedirs(novel_dir, exist_ok=True)
    else:
        print("\n目录已存在")

    print("\n详情地址" + detail_url)
    details_soup = _fetch_soup(detail_url, 'utf-8')
    list_anchors = details_soup.select(
        "#content > div.articleInfo > div.articleInfoRight > ol > p.right > a"
    )
    print("\n列表页地址")
    print(list_anchors)
    list_links = _HREF_RE.findall(str(list_anchors[0])) if list_anchors else []
    print("\n列表请求地址")
    print(list_links)
    if not list_links:
        print("\n未找到章节列表地址，已跳过")
        return
    list_url = str(list_links[0])

    lists_soup = _fetch_soup(list_url)
    chapter_hrefs = _HREF_RE.findall(str(lists_soup.select("#newlist")))
    # Strip the ".html" suffix, then sort by numeric value so that chapter
    # "10" comes after chapter "9" rather than before it.
    chapter_ids = bubbleSort(
        [href.strip() for href in str_replaces(chapter_hrefs)],
        key=_chapter_sort_key,
    )
    for chapter_id in chapter_ids:
        _save_chapter(driver, list_url + chapter_id + ".html", novel_dir)
        time.sleep(CHAPTER_DELAY)
    print("小说" + novel_name + "已全部爬取")


def start():
    """Ask for a novel name, search for it and download every match.

    Each matching novel gets its own directory beside this script, with one
    ``.txt`` file per chapter. A single headless browser is shared by all
    chapters and is always shut down at the end, even if a download fails.

    Raises:
        requests.RequestException: If the search or a listing request fails.
    """
    fiction = input("请输入你想获取的小说：")
    search_data = {'searchtype': 'articlename', 'searchkey': fiction}
    response = requests.post(
        SEARCH_URL, data=search_data, timeout=REQUEST_TIMEOUT
    )
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, "html.parser")
    results = soup.select("#content > div > ul > a")
    if not results:
        print("\n未找到相关小说")
        return

    basedir = os.path.abspath(os.path.dirname(__file__))
    driver = _create_driver()
    try:
        for anchor in results:
            _download_novel(driver, str(anchor), basedir)
    finally:
        # quit() ends the session and the chromedriver process; close() would
        # only close the current window.
        driver.quit()


if __name__ == "__main__":
    start()

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。