摘要:本文主要向大家介绍了【云计算】Ajax爬取详情页的url与实际页面上显示不符,通过具体的内容向大家展现,希望对大家学习云计算有所帮助。
本文主要向大家介绍了【云计算】Ajax爬取详情页的url与实际页面上显示不符,通过具体的内容向大家展现,希望对大家学习云计算有所帮助。
1. 使用Python爬取今日头条图集图片
1.1 爬取图片并下载到本地,同时将相关信息保存到MongoDB中。
toutiao.py
"""Crawl Toutiao gallery search results.

For every search-result page the script collects the article URLs, fetches
each article page, extracts the gallery image URLs embedded in the page,
downloads the images into the current working directory and stores a
summary record (title, url, images) in MongoDB.
"""
import json
import os
import re
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup  # noqa: F401  (kept from the original listing; no longer used below)
from pymongo.errors import PyMongoError
from requests.exceptions import RequestException

from config import GROUP_END, GROUP_START, KEYWORD, MONGO_DB, MONGO_TABLE, MONGO_URL

# connect=False postpones the actual connection until the first operation,
# so each worker process created by the Pool connects on its own.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]

# Request headers sent with every HTTP request issued by this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Connection': 'Keep-Alive',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}

# Seconds to wait for a server before giving up; without a timeout a stalled
# connection would block its worker process forever.
REQUEST_TIMEOUT = 10

# NOTE(review): this pattern is greedy and compiled with re.S, so group(1)
# runs from the first "title:" up to the LAST comma in the document. It is
# kept unchanged here; confirm against a real page before tightening it.
_TITLE_PATTERN = re.compile('title:(.*),', re.S)
# Captures the string literal passed to JSON.parse(...) for the gallery.
_GALLERY_PATTERN = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)


def _fetch_text(url):
    """Return the body of *url* as text, or None unless the status is 200.

    Raises:
        RequestException: propagated to the caller, which reports it.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if response.status_code == 200:
        return response.text
    return None


def get_page_index(offset, keyword):
    """Fetch one page of gallery search results.

    Args:
        offset: value sent as the ``offset`` query parameter.
        keyword: search keyword.

    Returns:
        The response body as text, or None when the request failed or the
        status code was not 200.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
        'from': 'gallery'
    }
    # The '?' separates the path from the query string; without it the
    # parameters were glued onto the path and the request could not succeed.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        return _fetch_text(url)
    except RequestException:
        print('请求失败')
        return None


def parse_page_index(html):
    """Yield the article URLs contained in a search-result JSON document.

    Args:
        html: text returned by get_page_index(); may be None or empty.

    Yields:
        Each non-empty ``article_url`` found under the top-level ``data`` key.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:
        print('解析索引页失败')
        return
    if not isinstance(payload, dict):
        return
    # "data" may be missing or null in the response; treat both as empty.
    for item in payload.get('data') or []:
        article_url = item.get('article_url')
        if article_url:
            yield article_url


def get_page_detail(url):
    """Fetch an article page.

    Returns:
        The page body as text, or None when the request failed or the
        status code was not 200.
    """
    try:
        return _fetch_text(url)
    except RequestException:
        print('请求详情页错误', url)
        return None


def parse_page_detail(html, url):
    """Extract the title and gallery images from an article page.

    Every image URL found is downloaded as a side effect.

    Args:
        html: article page source.
        url: address of the article, copied into the result.

    Returns:
        A dict with the keys ``title``, ``url`` and ``images``, or None when
        the page contains no parsable gallery. ``title`` is None when the
        title pattern does not match.
    """
    gallery_match = _GALLERY_PATTERN.search(html)
    if gallery_match is None:
        return None

    # Presumably the gallery JSON is embedded as an escaped string literal;
    # removing runs of one or two backslashes makes it loadable.
    # NOTE(review): this also strips the backslash of any \uXXXX escape.
    gallery_json = re.sub(r'\\{1,2}', '', gallery_match.group(1))
    try:
        gallery = json.loads(gallery_json)
    except ValueError:
        return None
    if not isinstance(gallery, dict) or 'sub_images' not in gallery:
        return None

    images = [item.get('url') for item in gallery.get('sub_images') or []]
    for image in images:
        download_image(image)

    title_match = _TITLE_PATTERN.search(html)
    return {
        'title': title_match.group(1) if title_match else None,
        'url': url,
        'images': images
    }


def download_image(url):
    """Download one image and save it to disk; always returns None."""
    print('正在下载:', url)
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        if response.status_code == 200:
            save_image(response.content)
    except RequestException:
        print('请求图片错误', url)
    return None


def save_image(content):
    """Write image bytes to ``<cwd>/<md5 of content>.jpg`` unless it exists.

    Naming the file after the content hash means identical images are
    written only once.
    """
    file_path = os.path.join(os.getcwd(), md5(content).hexdigest() + '.jpg')
    if not os.path.exists(file_path):
        # The with-statement closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)


def save_to_mongo(result):
    """Insert *result* into the configured MongoDB collection.

    Returns:
        True when the document was inserted, False when MongoDB reported
        an error.
    """
    try:
        # insert_one replaces Collection.insert, which PyMongo 4 removed.
        db[MONGO_TABLE].insert_one(result)
    except PyMongoError as exc:
        print('存储失败', exc)
        return False
    print('存储成功', result)
    return True


def main(offset):
    """Process one search-result page: fetch, parse, download and store."""
    index_html = get_page_index(offset, KEYWORD)
    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # One worker process per CPU; each offset is handled independently.
    with Pool() as pool:
        pool.map(main, groups)
在这里可以自定义全局配置信息:
config.py
# MongoDB connection settings.
# If the MongoDB server has no password, the bare host is enough; if it has
# one, the credentials must be supplied as well (for example as a URI such as
# 'mongodb://user:password@localhost:27017').
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# Inclusive range of result pages to crawl; toutiao.py multiplies each value
# by 20 to obtain the search offset.
GROUP_START = 1
GROUP_END = 20

# Search keyword.
KEYWORD = "街拍"
本文由职坐标整理并发布,希望对同学们有所帮助。了解更多详情请关注职坐标大数据云计算大数据安全频道!
您输入的评论内容中包含违禁敏感词
我知道了
请输入正确的手机号码
请输入正确的验证码
您今天的短信下发次数太多了,明天再试试吧!
我们会在第一时间安排职业规划师联系您!
您也可以联系我们的职业规划师咨询:
版权所有 职坐标-一站式IT培训就业服务领导者 沪ICP备13042190号-4
上海海同信息科技有限公司 Copyright ©2015 www.zhizuobiao.com,All Rights Reserved.
沪公网安备 31011502005948号