A Detailed Python Crawler Thread Pool Example (Scraping Pearvideo Short Videos)

Python crawler: scraping Pearvideo (梨视频) short videos with a thread pool

Example code

import requests
from lxml import etree
import random
from multiprocessing.dummy import Pool


# Worker function handed to the pool: pool.map() takes an iterable as its second
# argument, so get_video receives one item (a dict) from that iterable per call.
def get_video(dic):
  headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
    }
  video_data = requests.get(url = dic['url'] , headers = headers).content
  print(dic['name'] + ' download started')

  # Some video names contain spaces, which caused errors when downloads run
  # concurrently, so a random number is used as the file name instead.
  # path = dic['name'] + '.mp4'  # would raise an error
  # The ./lishipin/ folder must already exist.
  path = "./lishipin/" + str(int(random.random()*100)) + '.mp4'

  with open(path,'wb') as fp:
    fp.write(video_data)
  print(dic['name'] + ' downloaded successfully')


def main():
  # web_url: the Pearvideo site (category_5 is the "life" channel)
  web_url = 'https://www.pearvideo.com/category_5'
  headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
    }

  # web_page_text: the HTML of the listing page
  web_page_text = requests.get(url = web_url, headers = headers).text
  tree = etree.HTML(web_page_text)

  # Parse all li tags in the channel's video list, walk them, and extract each video's URL
  li_list = tree.xpath('//*[@id="listvideoListUl"]/li')
  rea_urls = []
  for li in li_list:
    # video_name: the title of the video
    video_name = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'

    # Prepend 'https://www.pearvideo.com/' to get the full video_url
    video_url = 'https://www.pearvideo.com/' + li.xpath("./div/a/@href")[0]

    # The URL extracted from the listing page is not the real MP4 address:
    # the video is loaded dynamically, so the real address has to be fetched
    # through an ajax request. The ajax response, however, returns a fake
    # address that differs from the real one in a single segment (cont...):
    ## real: https://video.pearvideo.com/mp4/third/20210208/cont-1719874-15690592-205957-ld.mp4
    ##                                                            1719874
    ## fake: https://video.pearvideo.com/mp4/third/20210208/1612867876612-15690592-205957-ld.mp4
    # Comparing the two shows the differing piece is the contId.
    # The ajax GET request to videoStatus.jsp needs contId and mrd parameters:
    # contId is parsed out of video_url, mrd is a random decimal.
    countId = video_url.split("/")[-1].split("_")[1]
    mrd = random.random()

    # Add the 'Referer' header, otherwise the response claims the video has been taken down
    headers = {
      'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56',
      'Referer': 'https://www.pearvideo.com/video_' + countId
      }
    ajax_url = 'https://www.pearvideo.com/videoStatus.jsp'

    # The ajax request that returns the fake address, e.g.
    # https://www.pearvideo.com/videoStatus.jsp?contId=1719874&mrd=0.7759942025851074
    params = {
        'contId': str(countId),
        'mrd': str(mrd)
      }

    # Send the GET request and parse the JSON response
    ajax_json = requests.get(url = ajax_url, headers = headers, params = params).json()

    # This is the fake address
    fake_url = ajax_json['videoInfo']['videos']['srcUrl']

    # Rework the fake address and splice the contId back in
    fake_url_list = fake_url.split('/')
    end = fake_url_list.pop()  # drop the fake file-name segment
    end_list = end.split("-")
    end_url = ""  # end_url collects the trailing part of the file name
    for i in range(len(end_list)-1):
      end_url = end_url + "-" + end_list[i+1]

    # Build the real address: reuse the fake address's path, then append cont-<contId>
    rea_url = ""
    for element in fake_url_list:
      rea_url = rea_url + element + "/"
    rea_url = rea_url + "cont-" + str(countId) + end_url

    # print(rea_url)

    dic = {
        'url': rea_url,
        'name': video_name
      }
    rea_urls.append(dic)

  # print(rea_urls)
  pool = Pool(4)
  pool.map(get_video, rea_urls)
  pool.close()
  pool.join()


if __name__ == '__main__':
  main()
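The core trick in the script above is rewriting the fake srcUrl returned by videoStatus.jsp into the real MP4 address by swapping the timestamp-like first segment of the file name for "cont-" plus the contId. Isolated from the crawler, the step looks like the minimal sketch below; to_real_url is a hypothetical helper introduced here for illustration, and the example addresses are the ones quoted in the comments above (they may have expired).

def to_real_url(fake_url, cont_id):
  # Split off the fake file name, keep everything after its first '-',
  # and rebuild the name as cont-<contId>-<rest>.
  parts = fake_url.split('/')
  fake_name = parts.pop()                            # e.g. 1612867876612-15690592-205957-ld.mp4
  tail = '-' + '-'.join(fake_name.split('-')[1:])    # -> -15690592-205957-ld.mp4
  return '/'.join(parts) + '/cont-' + str(cont_id) + tail

fake = 'https://video.pearvideo.com/mp4/third/20210208/1612867876612-15690592-205957-ld.mp4'
print(to_real_url(fake, 1719874))
# https://video.pearvideo.com/mp4/third/20210208/cont-1719874-15690592-205957-ld.mp4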

Extended knowledge:

Downloading videos with a Python crawler (Pearvideo)

Pearvideo example (tip: Ctrl+Alt+L reformats the code)

import re
import requests
import hashlib
import time

# print(response.status_code)  # the response status code
# print(response.content)      # returns the raw bytes
# print(response.text)         # returns the text content

mainurl = "https://www.pearvideo.com/"
videourl = "http://www.pearvideo.com/video_1499584"
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Accept-Encoding':'gzip, deflate, sdch',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language':'zh-CN,zh;q=0.8',
  }

# Collect the video page links from the home page
def geturls(url):
  res = requests.get(url)
  urls = re.findall(r'class="vervideo-tbd".*?href="(.*?)"', res.text, re.S)
  urllist = []
  for i in urls:
    prefix = 'https://www.pearvideo.com/'
    urllist.append(prefix + i)
  return urllist

# Extract the MP4 link from a video page and download it (write to disk)
def getvideo(url):
  res = requests.get(url, headers=headers)
  mp4url = re.findall(r'srcUrl="(.*?\.mp4)"', res.text, re.S)[0]
  video = requests.get(mp4url)
  # Hash the page URL plus the current time to get a unique file name
  m = hashlib.md5()
  m.update(url.encode('utf-8'))
  m.update(str(time.time()).encode('utf-8'))
  filename = '%s.mp4' % m.hexdigest()
  print(filename)
  with open("/home/tony/文档/爬虫视频/%s" % filename, 'wb') as f:
    f.write(video.content)

def main():
  video_urllist = geturls(mainurl)
  for i in video_urllist:
    getvideo(i)

if __name__ == '__main__':
  main()
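Note that this older example pulls srcUrl straight out of the page HTML; as the comments in the first script point out, the MP4 address is now loaded via an ajax request, so the regex may no longer find it on current pages. The example also downloads one video at a time. To tie it back to the thread-pool theme of this article, the sequential loop in main() could be swapped for the same multiprocessing.dummy pool used in the first script. A minimal sketch, assuming geturls(), getvideo(), and mainurl are defined exactly as above:

from multiprocessing.dummy import Pool

def main():
  video_urllist = geturls(mainurl)
  pool = Pool(4)                      # 4 worker threads
  pool.map(getvideo, video_urllist)   # download the videos concurrently
  pool.close()
  pool.join()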

This concludes the article on the Python crawler thread-pool example (scraping Pearvideo short videos).
