python爬虫入门之request模块爬虫

爬虫概念

爬虫,就是编写一个程序去模仿人上网的过程,让其自动地从互联网上获取数据。

爬虫分类

爬虫大致分为四种:

  • 通用爬虫:爬取整个页面。
  • 聚焦爬虫:只爬取页面中你需要的部分内容。
  • 增量式爬虫:动态爬取页面,尽可能保证爬到的是新的页面。
  • 深度爬虫:沿页面中的链接逐层深入,爬取更深层级的页面。

jupyter安装

命令行 pip install jupyter

request模块爬虫

  1. 先来个小案例,麦当劳找餐厅

    找餐厅页面:https://www.mcdonalds.com.cn/top/map

    请求的url:https://www.mcdonalds.com.cn/ajaxs/search_by_keywords

    # Query McDonald's restaurant-locator API for stores matching a place name.
    import requests 
    
    SEARCH_URL = 'https://www.mcdonalds.com.cn/ajaxs/search_by_keywords'
    keyword = input('输入查询的地点:')
    # POST form fields expected by the endpoint.
    payload = {
        'keywords': keyword,
        'city': '佛山市'  # auto-detected city; change it to search elsewhere
    }
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36"
    }
    response = requests.post(url=SEARCH_URL, data=payload, headers=request_headers)
    
    # The endpoint responds with JSON (displayed inline when run in Jupyter).
    response.json()
    
    

  2. 进入正题,爬虫的步骤。

    • 分析爬虫请求的url以及header

      如上面的案例

      你在请求时,页面并没有刷新,说明它为ajax请求,F12打开Network,选择XHR包类型,再次请求,就会看到你请求的包,点开即可看到,请求的url及请求所用的头部User-Agent。

      url='https://www.mcdonalds.com.cn/ajaxs/search_by_keywords'
      headers = {
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36"
      }
      
    • 分析请求时的data

      如上面案例

      data包含两个内容

      请求的data为键值对形式

      data = {
      	'keywords': '新天地', # 用户输入地点
      	'city':'佛山市' # 自动定位获取城市信息
      }
      
    • 分析请求响应的类型

      如上面的案例,它返回的是json

      所以最后用response.json()即可

爬取图片

urllib包

# Save a single remote image to disk with urllib's urlretrieve helper.
import requests
from urllib import request,parse

image_url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1555570504446&di=a37c522cc2b379ff55e544e576853890&imgtype=0&src=http%3A%2F%2Fimg3.cache.netease.com%2Fphoto%2F0003%2F2017-05-19%2FCKPVN4JI3LF60003.jpg'

# Download straight into a local file.
request.urlretrieve(image_url, './星空.png')

糗事百科图片爬取

# Scrape every thumbnail image from qiushibaike's image-ranking page
# and save the files into ./pic/.
import re
import requests
import os
from urllib import request

# Make sure the output directory exists before downloading.
if not os.path.exists('pic'):
    os.mkdir('pic')

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36"
}
    
url = 'https://www.qiushibaike.com/imgrank/'

page_text = requests.get(url=url, headers=headers).text
# Capture the src attribute of each thumbnail <img> (re.S lets .*? span newlines).
pattern = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'

for src in re.findall(pattern, page_text, re.S):
    full_url = 'https:' + src
    save_path = 'pic/' + full_url.split('/')[-1]
    # Fetch the image bytes and write them to save_path.
    request.urlretrieve(url=full_url, filename=save_path)
    print(save_path + '下载成功!!!')

爬取药监总局相关信息

# Scrape company records from the drug-administration portal: first collect
# record IDs page by page, then fetch each record's detail JSON.
import requests

# FIX: the original used mismatched quotes ('User-Agent": "Mozilla…'),
# which made `headers` a one-element *set* instead of a dict and would
# crash requests.post(headers=...).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36'
}
# Inclusive page range to scrape.
start = int(input('start page:'))
end = int(input('end page:'))

# Listing endpoint: returns one JSON page of records, each carrying an 'ID'.
url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'

id_list = []
# Collect the record IDs over the requested page range.
for page in range(start, end + 1):
    data = {
        'on': 'true',
        'page': str(page),
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': ''
    }
    first_dic = requests.post(url=url, headers=headers, data=data).json()
    for record in first_dic['list']:
        id_list.append(record['ID'])

# Detail endpoint: one POST per collected ID.
detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
for record_id in id_list:  # renamed from `id` to avoid shadowing the builtin
    data = {
        'id': record_id
    }
    detail_dic = requests.post(url=detail_url, headers=headers, data=data).json()
    print(detail_dic)
    
    

腾讯新闻爬取

# 爬取腾讯新闻
# Scrape Tencent's military-news focus items: pair each thumbnail's alt text
# (the headline) with the matching anchor's href and print them.
import requests
from bs4 import BeautifulSoup

url = 'https://new.qq.com/ninja/milite_focus.htm'


headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36"
}

page_text = requests.get(url=url, headers=headers).text

soup = BeautifulSoup(page_text, 'lxml')

# Anchors and their thumbnail images appear in the same document order.
links = soup.select("a.focus-item")
titles = soup.select("div.pic > img")

# FIX: iterate the pairs directly. The original built dict(zip(titles, links)),
# an unnecessary intermediate that could silently drop entries whenever two
# <img> tags compared equal.
for news_title, news_link in zip(titles, links):
    title = news_title.get("alt")
    link = news_link.get("href")
    data = {
        '标题': title,
        '链接': link
    }
    print(data)

halo博客爬取

# Scrape every post from a halo blog's archive page and save each article
# body as a Markdown file under ./halo/.
import os
import requests
import re

# Ensure the output directory exists.
if not os.path.exists('halo'):
    os.mkdir('halo')

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36"
}

url = 'https://blog.focuspoints.cn/archives'

page_text = requests.get(url=url, headers=headers).text

# Extract each post's link from the archive item's title block.
ex = '<div class="item-title">.*?<a href="(.*?)">.*?</div>'
url_list = re.findall(ex, page_text, re.S)

print(url_list)

# FIX: compile the article-body pattern once instead of on every iteration.
body_re = re.compile('<div class="article-content" itemprop="articleBody">(.*?)</div>', re.S)

for num, post_url in enumerate(url_list, start=1):
    pPath = 'halo/' + post_url.split('/')[-1] + '.md'
    print('------------------  ', num , '   ----------------')
    page_text1 = requests.get(url=post_url, headers=headers).text

    posts = body_re.findall(page_text1)
    print(posts)
    # FIX: the original did posts[0] unconditionally and raised IndexError
    # whenever the regex found no article body; skip such pages instead.
    if not posts:
        continue
    with open(pPath, 'wb') as fp:
        fp.write(posts[0].encode())