1500字范文 > python爬虫入门(五)------beautifulsoup4库实例

python爬虫入门(五)------beautifulsoup4库实例

时间：2023-06-20 13:39:36

文章目录

爬取股票信息并保存到自己电脑上

获取大学排名

爬取股票信息并保存到自己电脑上

"""Scrape a stock list page, then every stock's detail page, into a local text file.

Program structure:
1. getHTMLText()  - download one page and return its text.
2. getStockList() - collect stock codes from the links on the list page.
3. getStockInfo() - download each stock's detail page and append its fields to a file.
"""
import re

import requests
from bs4 import BeautifulSoup

# A stock code is 's', then 'h' or 'z', then six digits (e.g. "sh600000").
_STOCK_CODE_RE = re.compile(r'[s][hz]\d{6}')


def getHTMLText(url: str, code: str = 'utf-8') -> str:
    """Download *url* and return the response body decoded as *code*.

    Args:
        url: Address of the page to fetch.
        code: Text encoding to apply to the response body.

    Returns:
        The page text, or "" if the request fails for any reason
        (bad URL, connection error, timeout, non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort crawler: a page that cannot be fetched is simply skipped.
        return ""
    r.encoding = code
    return r.text


def getStockList(lst: list, stockURL: str) -> str:
    """Append every stock code found in the links of the list page to *lst*.

    Args:
        lst: List that receives the stock codes (mutated in place).
        stockURL: Address of the page that links to all stocks.

    Returns:
        Always "" (kept for compatibility with existing callers); the
        result is delivered through *lst*.
    """
    html = getHTMLText(stockURL, 'GB2312')
    soup = BeautifulSoup(html, 'html.parser')
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            continue  # <a> without a usable href
        match = _STOCK_CODE_RE.search(href)
        if match:
            lst.append(match.group(0))
    return ""


def _parseStockInfo(html: str):
    """Extract one stock's fields from its detail page; return None if the page lacks them."""
    soup = BeautifulSoup(html, 'html.parser')
    stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
    if stockInfo is None:
        return None
    name = stockInfo.find(attrs={'class': 'bets-name'})
    # split() breaks on whitespace; the first token is used as the stock name.
    nameParts = name.text.split() if name is not None else []
    if not nameParts:
        return None
    infoDict = {'股票名称': nameParts[0]}
    # <dt> holds a field name and <dd> its value; zip pairs them in document
    # order and stops at the shorter list if the counts differ.
    for keyTag, valueTag in zip(stockInfo.find_all('dt'), stockInfo.find_all('dd')):
        infoDict[keyTag.text] = valueTag.text
    return infoDict


def getStockInfo(lst: list, stockURL: str, fpath: str) -> None:
    """Fetch the detail page of every stock in *lst* and append its data to *fpath*.

    One line (the ``str()`` of a dict) is appended per stock whose page could
    be downloaded and parsed. A progress percentage is printed after every
    stock, including the ones that were skipped, so it always ends at 100%.

    Args:
        lst: Stock codes to look up.
        stockURL: URL prefix; the page address is ``stockURL + code + ".html"``.
        fpath: Output file, opened in append mode with UTF-8 encoding.

    Raises:
        OSError: If *fpath* cannot be opened or written. This is raised
            immediately instead of being swallowed, so an unwritable output
            path does not silently discard every record.
    """
    total = len(lst)
    for count, stock in enumerate(lst, start=1):
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        infoDict = _parseStockInfo(html) if html else None
        if infoDict is not None:
            # Re-open per record so each stock is on disk even if a later one fails.
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        print('\r当前进度:{:.2f}%'.format(count * 100 / total), end='')


def main():
    """Collect all stock codes, then dump every stock's details to the output file."""
    # NOTE(review): both URLs have no scheme/host as written here, so requests
    # rejects them and getHTMLText() returns "". Prepend the real site
    # addresses before running.
    stock_list_url = '/stocklist.html'
    stock_info_url = '/stock/'
    output_file = 'E://BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


if __name__ == "__main__":
    main()

获取大学排名

"""Print a university ranking table scraped from a single web page.

Input:  URL of the university ranking page.
Output: the ranking printed to the screen (rank, university name, total score).
Stack:  requests + bs4.
Scope:  focused crawler - only the given URL is fetched, no links are followed.

Program structure:
1. getHTMLText()   - fetch the ranking page from the network.
2. fillUnivList()  - extract the page's data into a suitable data structure.
3. printUnivList() - format and print the collected data.
"""
import requests
from bs4 import BeautifulSoup
import bs4


def getHTMLText(url: str) -> str:
    """Download *url* and return its text, decoded with the detected encoding.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or "" if the request fails for any reason
        (bad URL, connection error, timeout, non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        return ""
    r.encoding = r.apparent_encoding
    return r.text


def fillUnivList(ulist: list, html: str) -> None:
    """Append one ``[rank, name, score]`` entry to *ulist* per table row in *html*.

    Rows are the tag children of the first ``<tbody>``; the entry holds the
    ``.string`` of the row's first three ``<td>`` cells. Nothing is appended
    when the page has no ``<tbody>`` (e.g. *html* is "" after a failed
    download), and rows with fewer than three cells are skipped.

    Args:
        ulist: List that receives the rows (mutated in place).
        html: Markup of the ranking page.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        return
    for tr in tbody.children:
        # .children also yields the "\n" strings between rows; keep only tags.
        if not isinstance(tr, bs4.element.Tag):
            continue
        # Calling a tag, <tag>(..), is shorthand for <tag>.find_all(..).
        tds = tr('td')
        if len(tds) < 3:
            continue
        ulist.append([tds[0].string, tds[1].string, tds[2].string])


def printUnivList(ulist: list, num: int) -> None:
    """Print a header line followed by at most the first *num* rows of *ulist*.

    Args:
        ulist: Rows as produced by fillUnivList(); each row needs at least
            three items (rank, name, score).
        num: Maximum number of rows to print. Fewer are printed when *ulist*
            is shorter; a value below 1 prints only the header.
    """
    # In the template, {3} inside the second field means "use format()
    # argument 3 (counting from 0) as the fill character".
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    # Alignment fix for Chinese text: pad with the full-width space
    # chr(12288) instead of the ASCII space.
    pad = chr(12288)
    print(tplt.format("排名", "学校", "总分", pad))
    for u in ulist[:max(num, 0)]:
        # A cell's .string is None when the <td> has several children;
        # None cannot take an alignment spec, so print it as empty.
        cells = ["" if cell is None else cell for cell in (u[0], u[1], u[2])]
        print(tplt.format(cells[0], cells[1], cells[2], pad))


def main():
    """Fetch the ranking page and print its first 20 universities."""
    uinfo = []
    # NOTE(review): this URL has no scheme/host as written here, so requests
    # rejects it and getHTMLText() returns "". Prepend the real site address
    # before running.
    url = '/zuihaodaxuepaiming.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 20)  # 20 univs


if __name__ == "__main__":
    main()

参考学习网址：

https://www.icourse163.org/course/BIT-1001870001

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。