爬取网页,html代码报错
毕业设计
1
import requests
from bs4 import BeautifulSoup
import pprint
import json
def download_all_htmls():
htmls=[]
for idx in range(40):
url=f"http://www.crazyant.net/page/{idx+1}"
print("craw html",url)
r=requests.get(url)
if r.status_code!=200:
raise Exception("error")
htmls.append(r.text)
return htmls
def parse_single_html(html):
soup =BeautifulSoup(html,'html.parser')
articles=soup.find_all("article")
datas=[]
for article in articles:
title_node=(
article
.find("h2",class_="entry-title")
.find("a")
)
title=title_node.get_text()
link=title_node["herf"]
tag_nodes=(
article
.find("footer",class_="enry-footer")
.find("span",class_="tages-links")
.find_all("a")
)
tages=[tag_node.get_text()for tage_node in tag_nodes]
datas.append(
{"title":title,"link":link,"tags":tags}
)
return datas
all_datas=[]
for html in htmls:
all_datas.extend(parse_single_html(html))
with open("all_article_links.json""w")as fout:
for data in all_datas:
fout.write(json.dumps(data,ensure_ascii=False)+"\n")
运行结果及报错内容 :
C:\Users\Administrator\Desktop\test\Scripts\python.exe C:/Users/Administrator/Desktop/test/yuyue.py
Traceback (most recent call last):
File "C:\Users\Administrator\Desktop\test\yuyue.py", line 39, in <module>
for html in htmls:
NameError: name 'htmls' is not defined
进程已结束,退出代码1
我的解答思路和尝试过的方法 :应该是第38行htmls的问题,但是第33行tag_node.get和第35行"tags":tags,也有问题,但是运行正常
我想要达到的结果:希望程序运行正常,成功爬取网页
-
htmls是你上面函数download all htmls返回值,你需要先调用该函数
建议修改代码如下:
if __name__=='__main__': htmls=download_all_htmls() all_datas = [] for html in htmls: all_datas.extend(parse_single_html(html)) with open("all_article_links.json""w") as fout: for data in all_datas: fout.write(json.dumps(data, ensure_ascii=False) + "\n")
发表回复