
Scraping Anime Wallpapers with Python

Environment

Python: 3.8

Crawl target: http://www.netbian.com/e/search/result/index.php?page=0&searchid=4279

Modules used: requests, BeautifulSoup (bs4)
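
If the dependencies are not installed yet, both can be pulled in with pip (beautifulsoup4 is the package that provides the bs4 module):

pip install requests beautifulsoup4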

Approach

1. Build the URL pattern for the paginated search results (sketched just below)

2. Parse each result page with BeautifulSoup and extract the detail-page link for every image

3. Request each detail page to obtain the real image URL

4. Download each image from its real URL
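
For example, step 1 boils down to varying only the page query parameter in the search URL. A minimal sketch (searchid=4279 is the value used throughout this post):

for page in range(3):
    url = f'http://www.netbian.com/e/search/result/index.php?page={page}&searchid=4279'
    print(url)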

Implementation
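
The complete script below implements the four steps in order: build the page URLs, collect the detail-page links, resolve the real image URLs, and download the files.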

# -*- coding: utf-8 -*-
# @Time    : 2023/1/9 18:52
# @Author  : 南宫乘风
# @Email   : 1794748404@qq.com
# @File    : Crawl_pictures.py
# @Software: PyCharm
import os

import requests

# BeautifulSoup is used to parse the HTML that comes back
from bs4 import BeautifulSoup

headers = {
    # A browser User-Agent so the site does not reject the request outright
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
}



def get_url_info(page):
    """
    Request the search page for the given page number and return its HTML text
    :param page:
    :return:
    """
    url = f'http://www.netbian.com/e/search/result/index.php?page={page}&searchid=4279'
    res = requests.get(url, headers=headers).text
    return res


# First layer: collect the detail-page links from the list page
def get_url_list(res):
    """
    Filter the image links out of the page text and build absolute detail-page URLs
    :param res:
    :return:
    """
    soup = BeautifulSoup(res, "html.parser")
    # The thumbnail list lives inside <div class="list">
    div_class = soup.find(name='div', attrs={"class": "list"})
    a_name = div_class.find_all(name='a')
    image_url = []
    for i in a_name:
        image_url.append('http://www.netbian.com' + i['href'])
    return image_url


# Second layer: extract the real image URL from each detail page
def get_image_url(image_url):
    """
    Request each detail-page URL and pull out the real image address
    :param image_url:
    :return:
    """
    all_images_url = []
    for new_url in image_url:
        res = requests.get(new_url, headers=headers).text
        soup = BeautifulSoup(res, "html.parser")

        # The full-size image sits inside <div class="pic">
        div_class = soup.find(name='div', attrs={"class": "pic"})
        a_name = div_class.find_all(name='img')

        for i in a_name:
            all_images_url.append(i['src'])
    return all_images_url


def create_directory():
    """
    Check whether the download directory exists and create it if it does not
    :return:
    """
    isExists = os.path.exists('./4kdongman')
    if not isExists:
        os.makedirs('./4kdongman')


def start_main(numbers):
    """
    Crawl the given number of list pages and collect all real image URLs
    :param numbers:
    :return:
    """
    all_list_urls = []
    for i in range(0, numbers):
        res = get_url_info(i)
        image_url = get_url_list(res)
        print(image_url)
        all_url = get_image_url(image_url)
        all_list_urls = all_list_urls + all_url
    print(all_list_urls)
    return all_list_urls


def url_download(all_list_urls):
    """
    Download every image in the URL list into ./4kdongman
    :param all_list_urls:
    :return:
    """
    create_directory()
    count = 1
    for i in all_list_urls:
        print(i)
        img_data = requests.get(url=i, headers=headers).content
        filePath = './4kdongman/' + str(count) + '.jpg'
        with open(filePath, 'wb') as fp:
            fp.write(img_data)
        print('%s downloaded successfully' % count)
        count = count + 1


if __name__ == '__main__':
    """
    Entry point: crawl 10 list pages, then download every image found
    """
    all_list_urls = start_main(10)
    url_download(all_list_urls)
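
Running the script crawls the first 10 result pages and saves each image as 1.jpg, 2.jpg, and so on under ./4kdongman; adjust the argument passed to start_main to cover more or fewer pages.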
