用 Python 写一个网页爬虫

前言

网页爬虫是 Python 最经典的应用场景之一。无论是收集数据、监控价格、抓取资讯还是学术研究，爬虫都能帮你自动获取网页数据。

本文从零开始，带你用 Python 写出能用的爬虫。

⚠️ 法律提醒：爬虫需遵守 robots.txt 协议，不要爬取个人隐私数据，控制访问频率，仅用于合法目的。

一、工具安装

pip install requests beautifulsoup4 lxml

库	作用
`requests`	发送 HTTP 请求，获取网页内容
`beautifulsoup4`	解析 HTML，提取数据
`lxml`	高性能 HTML/XML 解析器

二、第一个爬虫：获取网页标题

import requests
from bs4 import BeautifulSoup

# 1. Send the request
url = "https://blog.iot2045.cn"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/120.0.0.0 Safari/537.36"
}

# Always pass a timeout: without one, requests may wait indefinitely on a
# server that never answers.
response = requests.get(url, headers=headers, timeout=10)
response.encoding = "utf-8"    # decode response.text as UTF-8

# 2. Check whether the request succeeded
if response.status_code == 200:
    print("✅ 请求成功")
else:
    print(f"❌ 请求失败: {response.status_code}")
    # exit() is a helper added by the site module for interactive sessions
    # and is not guaranteed to exist; raising SystemExit always works and
    # reports a non-zero exit status for the failure.
    raise SystemExit(1)

# 3. Parse the HTML
soup = BeautifulSoup(response.text, "lxml")

# 4. Extract data
title = soup.title.text if soup.title else "无标题"
print(f"网页标题: {title}")

# Look at the first 5 <a> tags and print those that carry an href
links = soup.find_all("a")
for link in links[:5]:
    href = link.get("href")
    text = link.text.strip()
    if href:
        # Truncate/pad the link text to 30 characters so the arrows line up
        print(f"  {text[:30]:30s} → {href}")

三、BeautifulSoup 核心用法

3.1 查找元素

# === Find by tag name ===
soup.find("h1")              # the first h1
soup.find_all("p")           # every p tag

# === Find by class ===
soup.find("div", class_="post")                    # class is a Python keyword, hence class_
soup.find_all("div", class_="article")

# === Find by id ===
soup.find(id="main")

# === Find by attribute ===
soup.find("meta", attrs={"name": "description"})
soup.find("a", href=True)                          # first a tag that has an href attribute

# === CSS selectors (the most flexible option; recommended) ===
soup.select(".post-title")                         # class selector
soup.select("#header")                             # id selector
soup.select("article h2")                          # descendant selector
soup.select("div.post > h2")                       # direct-child selector
soup.select("a[href^='https']")                    # links whose href starts with https

3.2 提取内容

# find() returns None when nothing matches, so check the result before
# using it in real code.
element = soup.find("h2", class_="title")

# Text
element.text                    # all text, including text inside child elements
element.string                  # the tag's single string; None when the tag has several children
element.get_text(strip=True)    # all text, with whitespace stripped from each text fragment

# Attributes
element["href"]                 # read an attribute; raises KeyError if it is missing
element.get("href", "默认值")     # safe read with a fallback value
element.attrs                   # dict of all attributes

# Navigation
element.parent                  # parent element
element.children                # iterator over the direct children
element.next_sibling            # next sibling node (may be a whitespace string rather than a tag)
element.find_next("p")          # first p tag that appears after this element

四、实战：抓取博客文章列表

import requests
from bs4 import BeautifulSoup
import json
import time

def fetch_articles(url):
    """Download one listing page and return the articles found on it.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of dicts with the keys "title", "link", "date" and
        "category". A missing title becomes "无标题"; other missing fields
        become "". Links are resolved to absolute URLs against ``url``.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with a 4xx/5xx status.
    """
    # Local import so this function works wherever it is pasted.
    from urllib.parse import urljoin

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
    }

    resp = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on error statuses: otherwise a 404/500 error page is parsed
    # like a normal page and silently yields an empty list.
    resp.raise_for_status()
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "lxml")

    articles = []

    # These selectors depend on the target page's markup; adjust as needed.
    for item in soup.select(".recent-post-item"):
        title_elem = item.select_one(".recent-post-info a")
        date_elem = item.select_one("time")
        category_elem = item.select_one(".article-category a")

        # select_one returns None when nothing matches, so every field
        # falls back to a default instead of raising.
        title = title_elem.text.strip() if title_elem else "无标题"
        href = title_elem.get("href", "") if title_elem else ""
        date = date_elem.get("datetime", "") if date_elem else ""
        category = category_elem.text.strip() if category_elem else ""

        articles.append({
            "title": title,
            # Resolve relative hrefs so the stored link is usable on its own.
            "link": urljoin(url, href) if href else "",
            "date": date,
            "category": category
        })

    return articles

# Fetch the article list, print it, then persist it
if __name__ == "__main__":
    url = "https://blog.iot2045.cn"
    articles = fetch_articles(url)

    print(f"✅ 抓取到 {len(articles)} 篇文章\n")
    for idx, post in enumerate(articles, start=1):
        # Three lines per article; the trailing "\n" on the last one leaves
        # a blank line between entries.
        entry_lines = (
            f"{idx}. {post['title']}",
            f"   日期: {post['date']}  分类: {post['category']}",
            f"   链接: {post['link']}\n",
        )
        print("\n".join(entry_lines))

    # Save as JSON; ensure_ascii=False keeps non-ASCII text readable
    serialized = json.dumps(articles, ensure_ascii=False, indent=2)
    with open("articles.json", "w", encoding="utf-8") as out_file:
        out_file.write(serialized)
    print("📁 已保存到 articles.json")

五、数据存储

5.1 保存为 JSON

import json

# Write: ensure_ascii=False stores non-ASCII text as-is instead of \uXXXX
# escapes; indent=2 pretty-prints the file.
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(articles, f, ensure_ascii=False, indent=2)

# Read it back
with open("data.json", "r", encoding="utf-8") as f:
    articles = json.load(f)

5.2 保存为 CSV

import csv

# newline="" lets the csv module control line endings itself.
# utf-8-sig writes a BOM at the start of the file
# (NOTE(review): presumably so Excel detects the file as UTF-8 — confirm).
with open("articles.csv", "w", newline="", encoding="utf-8-sig") as f:
    # fieldnames fixes the column order; each dict in articles is one row.
    writer = csv.DictWriter(f, fieldnames=["title", "link", "date", "category"])
    writer.writeheader()
    writer.writerows(articles)

5.3 存入 MySQL

import pymysql

# Connection settings are placeholders; replace them with real values
# (ideally loaded from configuration rather than hard-coded).
conn = pymysql.connect(host="localhost", user="root",
                       password="password", database="spider_db",
                       charset="utf8mb4")
try:
    # The cursor is closed automatically when the with-block exits.
    with conn.cursor() as cursor:
        # %s placeholders let the driver escape the scraped text; never
        # build the SQL with string formatting. executemany sends all rows
        # through a single call.
        cursor.executemany(
            "INSERT INTO articles (title, link, date, category) VALUES (%s, %s, %s, %s)",
            [
                (article["title"], article["link"], article["date"], article["category"])
                for article in articles
            ],
        )
    # Commit only after every row was accepted.
    conn.commit()
finally:
    # Always release the connection, even when an insert raised.
    conn.close()

5.4 下载图片

def download_image(img_url, save_path, timeout=10, chunk_size=8192):
    """Stream an image from ``img_url`` into the file ``save_path``.

    Args:
        img_url: Address of the image to download.
        save_path: Local file path the bytes are written to.
        timeout: Seconds to wait for the server before giving up.
        chunk_size: Number of bytes read from the response per iteration.

    Returns:
        None. The outcome is reported with a printed message.

    Raises:
        requests.RequestException: The request failed or timed out.
        OSError: ``save_path`` could not be opened or written.
    """
    headers = {"User-Agent": "Mozilla/5.0 ..."}

    # With stream=True the connection stays checked out until the body is
    # consumed or the response is closed; the with-block guarantees release
    # on every path, including the non-200 branch.
    with requests.get(img_url, headers=headers, stream=True, timeout=timeout) as resp:
        if resp.status_code != 200:
            print(f"❌ 下载失败: {resp.status_code}")
            return

        with open(save_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size):
                f.write(chunk)

    print(f"✅ 下载完成: {save_path}")

六、分页爬取

def fetch_all_pages(base_url, max_pages=5):
    """Fetch pages 1..max_pages of a paginated listing and merge the results.

    Pages are requested as ``{base_url}/page/{n}/``. A page that fails is
    reported and skipped; the remaining pages are still fetched.

    Args:
        base_url: Site root, with or without a trailing slash.
        max_pages: Number of pages to request, starting at page 1.

    Returns:
        A single list with the articles of every page that was fetched
        successfully, in page order.
    """
    all_articles = []
    # Avoid ".../​/page/1/" when the caller passes a URL ending in "/".
    base_url = base_url.rstrip("/")

    for page in range(1, max_pages + 1):
        url = f"{base_url}/page/{page}/"
        print(f"抓取第 {page} 页: {url}")

        try:
            articles = fetch_articles(url)
        except Exception as e:
            # Deliberately best-effort: one bad page must not abort the run.
            print(f"  → 抓取失败: {e}")
        else:
            all_articles.extend(articles)
            print(f"  → 获取 {len(articles)} 篇")

        # Pause between requests whether or not the page succeeded, so a
        # failing server is not hit back-to-back. No pause after the last page.
        if page < max_pages:
            time.sleep(1)

    return all_articles

七、反爬与应对策略

7.1 设置请求头

# Request headers passed along with each request via headers=...
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...",  # browser identification string
    "Referer": "https://www.google.com/",                           # page the request claims to come from
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",                   # preferred response languages, by weight
}

7.2 控制请求频率

import time
import random

time.sleep(1)                           # fixed wait of 1 second
time.sleep(random.uniform(1, 3))        # random wait between 1 and 3 seconds

7.3 使用 Session 保持登录状态

# A Session keeps cookies and default headers across requests.
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 ..."
})

# Log in (example credentials; load real ones from configuration)
login_data = {"username": "admin", "password": "123456"}
login_resp = session.post("https://example.com/login", data=login_data, timeout=10)
# Stop here if the login request itself failed (4xx/5xx). Some sites answer
# 200 even for bad credentials, so also check the response body if needed.
login_resp.raise_for_status()

# Later requests through the session send the login cookies automatically
resp = session.get("https://example.com/profile", timeout=10)

7.4 处理动态渲染页面

有些网站用 JavaScript 动态加载内容，requests 拿不到：

# Option 1: find the underlying API with the browser's developer tools
# Network → XHR → locate the data endpoint → request that endpoint directly

# Option 2: use Selenium (renders in a real browser; heavyweight)
# pip install selenium

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
try:
    driver.get("https://example.com")

    # Wait (up to 10 s) until the target element exists instead of sleeping
    # for a fixed time: this returns as soon as the element appears and
    # raises TimeoutException if it never does. Replace the locator with an
    # element that the page's JavaScript renders.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "h1"))
    )

    # Grab the rendered HTML and parse it as usual
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
finally:
    # Always shut the browser down, even if loading or waiting failed.
    driver.quit()

八、完整的爬虫脚本模板

#!/usr/bin/env python3
"""通用爬虫模板"""
import requests
from bs4 import BeautifulSoup
import json
import time
import random
from urllib.parse import urljoin

class BaseSpider:
    """Reusable crawler skeleton: fetch with retries, parse, collect, save.

    Subclasses implement ``parse``; the other methods work unchanged.
    """

    def __init__(self):
        # One shared Session reuses connections and sends the same default
        # headers (and any cookies) with every request.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
            "Accept-Language": "zh-CN,zh;q=0.9",
        })

    def get(self, url, retries=3):
        """GET ``url``, retrying with exponential backoff.

        Args:
            url: Address to fetch.
            retries: Total number of attempts; values below 1 count as 1.

        Returns:
            The ``requests.Response`` of the first successful attempt.

        Raises:
            requests.RequestException: Every attempt failed (connection
                error, timeout, or 4xx/5xx status). The last error is
                re-raised.
        """
        attempts = max(1, retries)
        for attempt in range(attempts):
            try:
                resp = self.session.get(url, timeout=10)
                resp.raise_for_status()
                return resp
            except requests.RequestException:
                if attempt == attempts - 1:
                    raise  # bare raise keeps the original traceback
                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...

    def parse(self, html, url=""):
        """Turn a page into data; subclasses must override this.

        Args:
            html: Text of the downloaded page.
            url: Address the page came from.

        Returns:
            Either one record or a list of records. ``crawl`` drops falsy
            results.
        """
        raise NotImplementedError

    def crawl(self, urls, delay=1):
        """Fetch and parse every URL, collecting the results in one flat list.

        A URL that fails to download or parse is reported and skipped.

        Args:
            urls: Iterable of addresses to visit, in order.
            delay: Base pause in seconds between two requests; up to 0.5 s
                of random jitter is added each time.

        Returns:
            A flat list of records: a list returned by ``parse`` is merged
            into it, any other truthy value is appended as a single record.
        """
        urls = list(urls)  # accept generators too; len() is needed below
        total = len(urls)
        results = []
        for i, url in enumerate(urls, 1):
            print(f"[{i}/{total}] {url}")
            try:
                resp = self.get(url)
                data = self.parse(resp.text, url)
            except Exception as e:
                # Deliberately best-effort: one bad page must not stop the run.
                print(f"  ❌ 错误: {e}")
            else:
                if isinstance(data, list):
                    # Merge per-page lists so the result is a flat list of
                    # records rather than a list of lists.
                    results.extend(data)
                elif data:
                    results.append(data)

            # Throttle between requests; nothing to wait for after the last.
            if i < total:
                time.sleep(delay + random.uniform(0, 0.5))

        return results

    def save(self, data, filename="output.json"):
        """Write ``data`` to ``filename`` as pretty-printed UTF-8 JSON.

        Args:
            data: JSON-serializable collection of records.
            filename: Path of the output file; it is overwritten.
        """
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"📁 已保存 {len(data)} 条数据到 {filename}")

# ===== Usage =====
class MySpider(BaseSpider):
    """Example spider: one record with title and URL per ``.item`` element."""

    def parse(self, html, url=""):
        """Extract the title and absolute link of every ``.item`` on the page.

        Args:
            html: Text of the downloaded page.
            url: Address of the page; relative links are resolved against it.

        Returns:
            A list of dicts with the keys "title" and "url". Items lacking
            a ``.title`` element or an ``a`` element are skipped.
        """
        soup = BeautifulSoup(html, "lxml")
        # Selectors depend on the target site's markup; adjust as needed.
        items = []
        for elem in soup.select(".item"):
            title_elem = elem.select_one(".title")
            link_elem = elem.select_one("a")
            # select_one returns None when nothing matches. Skip incomplete
            # items instead of letting one AttributeError discard the page.
            if title_elem is None or link_elem is None:
                continue
            items.append({
                "title": title_elem.text.strip(),
                "url": urljoin(url, link_elem.get("href", "")),
            })
        return items

# Script entry point: crawl five example pages and save what was collected.
if __name__ == "__main__":
    spider = MySpider()
    # Pages 1 through 5 of the example site
    urls = [f"https://example.com/page/{i}" for i in range(1, 6)]
    # delay=2 is passed to crawl() as the base pause between requests
    results = spider.crawl(urls, delay=2)
    spider.save(results, "data.json")

九、爬虫礼仪

✅ 遵守 robots.txt（在域名后加 /robots.txt 查看）
✅ 控制访问频率，不要给目标服务器造成压力
✅ 设置合理的 User-Agent，标识自己身份
❌ 不要爬取个人隐私数据
❌ 未经许可，不要将爬取的数据用于商业用途
❌ 不要绕过网站的认证/付费机制

结语

爬虫是一门上手体验很好的技能——写几行代码就能自动抓取数据，成就感十足。

学习路径：

1. 用 requests + BeautifulSoup 抓取静态页面（本文内容）
2. 学会用浏览器开发者工具的 Network 面板分析并找到数据接口
3. 遇到反爬再学代理 IP、验证码识别等进阶技巧

最好的练习：选一个你常访问的网站，试着抓取它。🕷️