爬取详情页内容的代码写得不对，该怎么改？

毕设驿站 其他问答 1
import re

import requests
from openpyxl import Workbook

# Excel export is disabled for now; uncomment to collect results in a sheet.
# wb=Workbook()
# ws =wb.active
# ws.append(["详情介绍"])

# Request headers carrying a desktop-Chrome User-Agent string.
# (Must start at column 0: an indented top-level statement is an
# IndentationError.)
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36  (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}

# Detail-page URL template, e.g. https://www.hjutv.cn/detail_6119.html
detail_url = "https://www.hjutv.cn/detail_{}.html"

# Compiled once, outside the loop. re.S lets "." match newlines, so a
# description that spans several lines is still captured.
# NOTE(review): '<div >' matches only a div tag written exactly like that,
# with no attributes. If the real div carries a class attribute this pattern
# finds nothing -- verify against the live page source.
detail_pattern = re.compile(r'<div ><span>(.*?)</span></div>', re.S)

for i in range(1, 11):
    url = detail_url.format(i)
    print(url)
    # A timeout keeps one unresponsive page from hanging the whole run.
    response = requests.get(url=url, headers=headers, timeout=10)
    html = response.text
    detail = detail_pattern.findall(html)
    print(detail)

回复

共2条回复 我来回复
  • 源码码头网
    这个人很懒,什么都没有留下~
    评论
    import re

    import requests
    from openpyxl import Workbook

    # Workbook that collects one row per listing item; first row is the header.
    wb = Workbook()
    ws = wb.active
    ws.append(["韩剧名","主演","详情介绍"])

    # Request headers carrying a desktop-Chrome User-Agent string.
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
    }

    # URL templates: "{}" is the page number / the numeric detail id.
    page_url = "https://www.hjutv.cn/show_2________{}___.html"
    detail_url = "https://www.hjutv.cn/detail_{}.html"

    # All patterns are loop-invariant, so compile them once up front.
    # NOTE(review): tags such as '<li >' and '<p >' match only tags written
    # exactly like that (no attributes) -- verify against the live HTML.
    tvplay_pattern = re.compile(r'<li >(.*?)</li>',re.S)        # one listing item
    name_pattern = re.compile(r'<p ><a[^>]+>([^<]+)',re.S)      # title link text
    actor_name_p_pattern = re.compile(r'<p >(.*?)</p>',re.S)    # paragraph holding the actor links
    actor_name_pattern = re.compile(r'<a[^>]+>([^<]+)',re.S)    # text of each link in that paragraph
    href_pattern = re.compile(r'detail_(\d+)',re.S)             # numeric id of the detail page
    intro_pattern = re.compile(r'<div ><span>([^<]+)',re.S)     # introduction text on the detail page

    # The original issued an extra request to the unformatted template URL
    # (still containing "{}") and discarded the result; it has been removed.

    for i in range(1, 2):  # widen this range to crawl more listing pages
        url = page_url.format(i)
        # A timeout keeps one unresponsive page from hanging the whole run.
        response = requests.get(url=url, headers=headers, timeout=10)
        html = response.text
        tvplay_list = tvplay_pattern.findall(html)

        for tvplay in tvplay_list:
            # Title: first match, or a placeholder when nothing matched.
            name = name_pattern.findall(tvplay)
            name = name[0] if name else "---"

            # Actors: link texts of the first matching paragraph, comma-joined.
            actor_name = ""
            actor_name_p = actor_name_p_pattern.findall(tvplay)
            if actor_name_p:
                actor_name = ','.join(actor_name_pattern.findall(actor_name_p[0]))

            # Introduction: fetched from the item's detail page. An item
            # without a detail link keeps the placeholder instead of raising
            # IndexError and losing the rows gathered so far.
            detail = "---"
            href = href_pattern.findall(tvplay)
            if href:
                detail_html = requests.get(
                    detail_url.format(href[0]), headers=headers, timeout=10
                ).text
                intro = intro_pattern.findall(detail_html)
                if intro:
                    detail = intro[0]

            print(name,actor_name,detail)
            ws.append([name,actor_name,detail])

    wb.save("韩剧.xlsx")
    
    0条评论
  • 毕设向导
    这个人很懒,什么都没有留下~
    评论
    import requests
    from lxml import etree

    # Request headers carrying a desktop-Chrome User-Agent string.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
            " Chrome/93.0.4577.63 Safari/537.36"
        )
    }

    # Detail-page URL template; "{}" is replaced by the numeric page id.
    detail_url = "https://www.hjutv.cn/detail_{}.html"

    # Text nodes of every <span> below the div whose class attribute equals
    # this exact string.
    desc_xpath = '//div[@class="ddb_a5df_df3685a content_desc context clearfix"]//span/text()'

    # Visit detail pages 1 through 10, printing each URL and the text found.
    for url in map(detail_url.format, range(1, 11)):
        print(url)
        page = requests.get(url, headers=headers)
        tree = etree.HTML(page.text)
        print(tree.xpath(desc_xpath))
    
    0条评论

发表回复

登录后才能评论