
Scraping Liepin Job Listings with Python 3

Date: 2020-07-14 09:53:42



# Scrape ~4,000 bank job listings from Liepin's finance category and save them to a CSV file.
# requests and lxml are third-party libraries and must be installed separately.
# A revised version appears further below.
import requests, csv, time
from lxml import etree
from requests.exceptions import RequestException

# Relative path as published; the Liepin host (e.g. https://www.liepin.com) must be prepended before requesting.
url = '/zhaopin/?headckid=4ba8c02991d96408&industries=130'
num = 1  # page counter, used to label errors

def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return etree.HTML(response.text)
        return None
    except RequestException:
        return None

def parse_one_page(html):
    # Locate each listing with XPath and extract its fields.
    for job in html.xpath('//div[@class="sojob-item-main clearfix"]'):
        city = job.xpath('div/p/a/text()')[0].strip()
        name = job.xpath('div/h3/a/text()')[0].strip()
        url = job.xpath('div/h3/a/@href')[0].strip()
        firm = job.xpath('div//p[@class="company-name"]/a/text()')[0].strip()
        salary = job.xpath('div/p/span/text()')[0].strip()
        exper = job.xpath('div/p//span[3]/text()')[0].strip()
        edu = job.xpath('div/p//span[2]/text()')[0].strip()
        # Named pub_time rather than time so the time module is not shadowed.
        pub_time = job.xpath('div//p[@class="time-info clearfix"]/time/text()')[0].strip()
        yield {
            '城市': city, '职位': name, '网址': url, '公司': firm, '薪酬': salary,
            '工作经验要求': exper, '学历要求': edu, '发布时间': pub_time
        }

def init_csv():
    # Initialize the CSV file: write the crawl metadata and the header row.
    crawl_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    with open('result.csv', 'a', newline='') as my_csv:
        my_writer = csv.writer(my_csv)
        my_writer.writerow(['爬取对象:猎聘网金融行业银行', url, crawl_time])
        my_writer.writerow(['城市', '职位', '网址', '公司', '薪酬', '工作经验要求', '学历要求', '发布时间'])

def write_to_csv(content):
    # Append one parsed listing to the CSV file.
    with open('result.csv', 'a', newline='') as my_csv:
        fieldnames = ['城市', '职位', '网址', '公司', '薪酬', '工作经验要求', '学历要求', '发布时间']
        my_writer = csv.DictWriter(my_csv, fieldnames=fieldnames)
        my_writer.writerow(content)

def main(offset):
    crawl_url = url + '&curPage=' + str(offset)
    html = get_one_page(crawl_url)
    for item in parse_one_page(html):
        write_to_csv(item)

if __name__ == '__main__':
    init_csv()
    for i in range(1, 100):
        try:
            main(i)
            time.sleep(1)  # throttle requests to avoid getting the IP banned
        except BaseException as e:
            print(num, e)
        finally:
            num += 1

Running it produces the following output:

============== RESTART: C:/Users/Administrator/Desktop/test.py ==============
50 'gbk' codec can't encode character '\xa0' in position 11: illegal multibyte sequence
66 'gbk' codec can't encode character '\xa0' in position 11: illegal multibyte sequence
95 list index out of range
96 list index out of range
>>>

Four exceptions were raised (the leading number is the page counter num). The two 'gbk' errors occur because open() without an explicit encoding uses the system default codec (gbk on Chinese Windows), which cannot encode the non-breaking space '\xa0' found in some listings. The two 'list index out of range' errors come from listings where an XPath query matched nothing, so indexing [0] into the empty result fails.
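As an aside, the two gbk failures could also be eliminated at the source by passing an explicit encoding to open() in init_csv and write_to_csv. This is a suggestion of mine, not part of the original fix, and the sample row below is invented:

import csv

# A row containing '\xa0' fails under the default gbk codec on Chinese
# Windows, but writes cleanly once the encoding is set explicitly;
# utf-8-sig adds a BOM so Excel detects the encoding.
with open('result.csv', 'a', newline='', encoding='utf-8-sig') as my_csv:
    csv.writer(my_csv).writerow(['上海\xa0', '柜员', '8-10万'])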

The revised version instead logs each failure together with its page and item number and keeps crawling:

# Scrape ~4,000 bank job listings from Liepin's finance category and save them to a CSV file.
# requests and lxml are third-party libraries and must be installed separately.
import requests, csv, time
from lxml import etree
from requests.exceptions import RequestException

# Relative path as published; the Liepin host (e.g. https://www.liepin.com) must be prepended before requesting.
url = '/zhaopin/?headckid=4ba8c02991d96408&industries=130'
num_1, num_2 = 0, 0  # current page number and item number within the page, for locating failures

def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return etree.HTML(response.text)
        return None
    except RequestException:
        return None

def parse_one_page(html):
    # Locate each listing with XPath; log and skip any listing whose fields cannot be extracted.
    global num_2
    for job in html.xpath('//div[@class="sojob-item-main clearfix"]'):
        num_2 += 1
        try:
            city = job.xpath('div/p/a/text()')[0].strip()
            name = job.xpath('div/h3/a/text()')[0].strip()
            url = job.xpath('div/h3/a/@href')[0].strip()
            firm = job.xpath('div//p[@class="company-name"]/a/text()')[0].strip()
            salary = job.xpath('div/p/span/text()')[0].strip()
            exper = job.xpath('div/p//span[3]/text()')[0].strip()
            edu = job.xpath('div/p//span[2]/text()')[0].strip()
            # Named pub_time rather than time so the time module is not shadowed.
            pub_time = job.xpath('div//p[@class="time-info clearfix"]/time/text()')[0].strip()
            yield {
                '城市': city, '职位': name, '网址': url, '公司': firm, '薪酬': salary,
                '工作经验要求': exper, '学历要求': edu, '发布时间': pub_time,
                '页面': num_1, '条目': num_2
            }
        except BaseException as e:
            print(num_1, num_2, e)

def init_csv():
    # Initialize the CSV file: write the crawl metadata and the header row.
    crawl_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    with open('result.csv', 'a', newline='') as my_csv:
        my_writer = csv.writer(my_csv)
        my_writer.writerow(['爬取对象:猎聘网金融行业银行', url, crawl_time])
        my_writer.writerow(['城市', '职位', '网址', '公司', '薪酬', '工作经验要求', '学历要求', '发布时间', '页面', '条目'])

def write_to_csv(content):
    # Append one parsed listing to the CSV file, logging rows that cannot be written.
    with open('result.csv', 'a', newline='') as my_csv:
        fieldnames = ['城市', '职位', '网址', '公司', '薪酬', '工作经验要求', '学历要求', '发布时间', '页面', '条目']
        my_writer = csv.DictWriter(my_csv, fieldnames=fieldnames)
        try:
            my_writer.writerow(content)
        except BaseException as e:
            print(num_1, num_2, e)

def main(offset):
    global num_1, num_2
    num_1, num_2 = num_1 + 1, 0
    crawl_url = url + '&curPage=' + str(offset)
    html = get_one_page(crawl_url)
    if html is None:
        return  # request failed; skip this page instead of crashing on None
    for item in parse_one_page(html):
        write_to_csv(item)

if __name__ == '__main__':
    init_csv()
    for i in range(1, 100):
        main(i)
        time.sleep(1)  # throttle requests to avoid getting the IP banned
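To sanity-check the XPath selectors without hitting the site, they can be run against a mock listing. The fragment below is inferred purely from the selectors in parse_one_page (the extra class names and all field values are illustrative, not copied from Liepin), and it assumes it runs in the same module as the code above:

from lxml import etree

# Offline check of the XPath selectors in parse_one_page. The markup is
# reverse-engineered from the selectors themselves; field values are
# invented sample data.
MOCK_LISTING = '''
<div class="sojob-item-main clearfix">
  <div class="job-info">
    <h3><a href="/job/123.shtml">柜员</a></h3>
    <p class="condition">
      <span>8-10万</span>
      <span>本科及以上</span>
      <span>3年工作经验</span>
      <a>上海</a>
    </p>
    <p class="time-info clearfix"><time>2020-07-13</time></p>
  </div>
  <div class="company-info">
    <p class="company-name"><a>某银行</a></p>
  </div>
</div>
'''

for row in parse_one_page(etree.HTML(MOCK_LISTING)):
    print(row)
# Expected output: 城市=上海, 职位=柜员, 薪酬=8-10万, 学历要求=本科及以上,
# 工作经验要求=3年工作经验, 发布时间=2020-07-13

If the site's markup changes later, an offline check like this makes it obvious which selector broke.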
