前言

网页爬虫是 Python 最经典的应用场景之一。无论是收集数据、监控价格、抓取资讯还是学术研究,爬虫都能帮你自动获取网页数据。

本文从零开始,带你用 Python 写出能用的爬虫。

⚠️ 法律提醒:爬虫需遵守 robots.txt 协议,不要爬取个人隐私数据,控制访问频率,仅用于合法目的。


一、工具安装

1
pip install requests beautifulsoup4 lxml
各库的作用:
requests:发送 HTTP 请求,获取网页内容
beautifulsoup4:解析 HTML,提取数据
lxml:高性能 HTML/XML 解析器

二、第一个爬虫:获取网页标题

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import requests
from bs4 import BeautifulSoup

# 1. Send the request.
url = "https://blog.iot2045.cn"
# Browser-like User-Agent sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

# Always pass a timeout: without one requests can wait forever on a stalled
# server. Connection errors and timeouts are reported instead of ending the
# script with a raw traceback.
try:
    response = requests.get(url, headers=headers, timeout=10)
except requests.RequestException as exc:
    print(f"❌ 请求失败: {exc}")
    raise SystemExit(1)
response.encoding = "utf-8"  # decode the body as UTF-8

# 2. Check whether the request succeeded.
if response.status_code != 200:
    print(f"❌ 请求失败: {response.status_code}")
    # SystemExit(1) instead of exit(): exit() is a helper meant for the
    # interactive shell, and a non-zero status tells the caller the run failed.
    raise SystemExit(1)
print("✅ 请求成功")

# 3. Parse the HTML.
soup = BeautifulSoup(response.text, "lxml")

# 4. Extract data.
title = soup.title.text if soup.title else "无标题"
print(f"网页标题: {title}")

# Look at the first 5 <a> tags and print those that carry an href.
links = soup.find_all("a")
for link in links[:5]:
    href = link.get("href")
    text = link.text.strip()
    if href:
        # Link text is cut/padded to 30 characters so the URLs line up.
        print(f" {text[:30]:30s}{href}")

三、BeautifulSoup 核心用法

3.1 查找元素

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Lookup cheat sheet. Assumes `soup` is an already-parsed BeautifulSoup document.

# === Find by tag name ===
soup.find("h1")  # the first h1
soup.find_all("p")  # every p tag

# === Find by class ===
soup.find("div", class_="post")  # `class` is a Python keyword, hence `class_`
soup.find_all("div", class_="article")

# === Find by id ===
soup.find(id="main")

# === Find by attribute ===
soup.find("meta", attrs={"name": "description"})
soup.find("a", href=True)  # an <a> tag that has an href attribute

# === CSS selectors (the most flexible option; recommended) ===
soup.select(".post-title")  # class selector
soup.select("#header")  # id selector
soup.select("article h2")  # descendant selector
soup.select("div.post > h2")  # direct-child selector
soup.select("a[href^='https']")  # links whose href starts with "https"

3.2 提取内容

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Extraction cheat sheet. Assumes `soup` is an already-parsed BeautifulSoup
# document and that the lookup below found a tag (find() gives None otherwise).
element = soup.find("h2", class_="title")

# Text
element.text  # all text, including text of child elements
element.string  # the tag's single direct string; None when the tag has several children
element.get_text(strip=True)  # text with leading/trailing whitespace stripped

# Attributes
# NOTE(review): an <h2> rarely carries an href; these lines only show the syntax.
element["href"]  # read an attribute (raises KeyError when it is absent)
element.get("href", "默认值")  # safe read with a fallback value
element.attrs  # dict of all attributes

# Navigation
element.parent  # parent element
element.children  # direct children
element.next_sibling  # next sibling node (may be a text node rather than a tag)
element.find_next("p")  # the first p tag that comes after this element

四、实战:抓取博客文章列表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import requests
from bs4 import BeautifulSoup
import json
import time

def fetch_articles(url):
    """Fetch a blog listing page and return the articles found on it.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        A list of dicts with the keys "title", "link", "date" and "category".
        A missing title becomes "无标题"; other missing fields become "".
        Links are resolved against *url*, so relative hrefs come back absolute.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an HTTP
            error status (4xx/5xx).
    """
    # Local import so the function does not rely on an import elsewhere in the file.
    from urllib.parse import urljoin

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
    }

    resp = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error status instead of parsing an error page into [].
    resp.raise_for_status()
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "lxml")

    articles = []

    # These selectors match the target page's markup; adjust them for other sites.
    for item in soup.select(".recent-post-item"):
        title_elem = item.select_one(".recent-post-info a")
        date_elem = item.select_one("time")
        category_elem = item.select_one(".article-category a")

        # Every field is optional: fall back to a placeholder rather than crash.
        href = title_elem.get("href", "") if title_elem else ""

        articles.append({
            "title": title_elem.text.strip() if title_elem else "无标题",
            "link": urljoin(url, href) if href else "",
            "date": date_elem.get("datetime", "") if date_elem else "",
            "category": category_elem.text.strip() if category_elem else "",
        })

    return articles

# 抓取并保存
if __name__ == "__main__":
    # Scrape the blog's front page.
    url = "https://blog.iot2045.cn"
    articles = fetch_articles(url)

    # Print a numbered summary: title, then date/category, then link.
    print(f"✅ 抓取到 {len(articles)} 篇文章\n")
    for idx, post in enumerate(articles, start=1):
        print(
            f"{idx}. {post['title']}",
            f" 日期: {post['date']} 分类: {post['category']}",
            f" 链接: {post['link']}\n",
            sep="\n",
        )

    # Persist the result as human-readable JSON (non-ASCII kept as-is).
    with open("articles.json", "w", encoding="utf-8") as out:
        json.dump(articles, out, indent=2, ensure_ascii=False)
    print("📁 已保存到 articles.json")

五、数据存储

5.1 保存为 JSON

1
2
3
4
5
6
7
8
import json

# Serialise first, then write the text out in one go.
payload = json.dumps(articles, ensure_ascii=False, indent=2)
with open("data.json", "w", encoding="utf-8") as out:
    out.write(payload)

# Read the file back and decode it into Python objects.
with open("data.json", "r", encoding="utf-8") as src:
    articles = json.loads(src.read())

5.2 保存为 CSV

1
2
3
4
5
6
import csv

# Column order of the CSV; these are the keys read from each article dict.
columns = ["title", "link", "date", "category"]

# "utf-8-sig" writes a BOM at the start of the file — presumably so that
# spreadsheet applications detect the encoding; newline="" is what the csv
# module expects for files it writes.
with open("articles.csv", "w", newline="", encoding="utf-8-sig") as out:
    writer = csv.DictWriter(out, fieldnames=columns)
    writer.writeheader()
    for row in articles:
        writer.writerow(row)

5.3 存入 MySQL

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import pymysql

# NOTE(review): credentials are hard-coded for the demo; load them from the
# environment or a config file in real code.
conn = pymysql.connect(host="localhost", user="root",
                       password="password", database="spider_db",
                       charset="utf8mb4")
try:
    with conn.cursor() as cursor:
        # Parameterised statement: values are passed separately from the SQL
        # text, and executemany sends all rows through one call.
        cursor.executemany(
            "INSERT INTO articles (title, link, date, category) VALUES (%s, %s, %s, %s)",
            [
                (article["title"], article["link"], article["date"], article["category"])
                for article in articles
            ],
        )
    conn.commit()
except Exception:
    # Undo the partial batch, then let the error surface.
    conn.rollback()
    raise
finally:
    # Close the connection even when an insert fails.
    conn.close()

5.4 下载图片

1
2
3
4
5
6
7
8
9
10
11
def download_image(img_url, save_path):
    """Download *img_url* and write it to *save_path*.

    The body is streamed to disk in chunks, so the whole image is never held
    in memory. On a non-200 status nothing is written and a failure message
    is printed.

    Args:
        img_url: Address of the image to download.
        save_path: Path of the file to write.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        OSError: If *save_path* cannot be written.
    """
    headers = {"User-Agent": "Mozilla/5.0 ..."}
    # The `with` block releases the connection even if writing fails; the
    # timeout keeps a stalled server from hanging the download forever.
    with requests.get(img_url, headers=headers, stream=True, timeout=10) as resp:
        if resp.status_code != 200:
            print(f"❌ 下载失败: {resp.status_code}")
            return

        with open(save_path, "wb") as f:
            for chunk in resp.iter_content(1024):
                f.write(chunk)

    print(f"✅ 下载完成: {save_path}")

六、分页爬取

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
def fetch_all_pages(base_url, max_pages=5):
    """Scrape pages 1..max_pages of a paginated listing.

    Page URLs are built as "<base_url>/page/<n>/". A page that fails is
    reported and skipped; the remaining pages are still fetched.

    Args:
        base_url: Site root, with or without a trailing slash.
        max_pages: Number of pages to request.

    Returns:
        The articles of every successfully fetched page, in page order.
    """
    all_articles = []
    # Drop trailing slashes so the URL never contains "//page/".
    root = base_url.rstrip("/")

    for page in range(1, max_pages + 1):
        url = f"{root}/page/{page}/"
        print(f"抓取第 {page} 页: {url}")

        try:
            articles = fetch_articles(url)
        except Exception as e:
            # Best effort: one bad page must not abort the whole run.
            print(f" → 抓取失败: {e}")
        else:
            all_articles.extend(articles)
            print(f" → 获取 {len(articles)} 篇")

        # Polite pause between requests. It also applies after a failure, so
        # a struggling server is not hit again straight away; there is
        # nothing to wait for after the last page.
        if page < max_pages:
            time.sleep(1)

    return all_articles

七、反爬与应对策略

7.1 设置请求头

1
2
3
4
5
# Example request headers: a browser-style User-Agent (abbreviated with "..."),
# a Referer naming Google as the previous page, and a language preference that
# ranks Simplified Chinese first, then English.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...",
    "Referer": "https://www.google.com/",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}

7.2 控制请求频率

1
2
3
4
5
import time
import random

# Two ways to pause between requests.
time.sleep(1)  # fixed wait of one second
time.sleep(random.uniform(1, 3))  # random wait between 1 and 3 seconds

7.3 使用 Session 保持登录状态

1
2
3
4
5
6
7
8
9
10
11
# One Session object reused for every request, with a default User-Agent.
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 ..."
})

# Log in by POSTing the form fields (placeholder credentials).
login_data = {"username": "admin", "password": "123456"}
session.post("https://example.com/login", data=login_data)

# Later requests made through the same session carry its cookies automatically.
resp = session.get("https://example.com/profile")

7.4 处理动态渲染页面

有些网站用 JavaScript 动态加载内容,requests 拿不到:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# 方案 1:直接用浏览器开发者工具找到 API 接口
# Network → XHR → 找到数据接口 → 直接请求接口

# 方案 2:使用 Selenium(真实浏览器渲染,重武器)
# pip install selenium

import time

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
try:
    driver.get("https://example.com")

    # Give the page's JavaScript time to render. A fixed pause is the simplest
    # approach; an explicit wait on a specific element is more reliable.
    time.sleep(3)

    # Grab the HTML as rendered by the browser.
    html = driver.page_source
finally:
    # Always shut the browser down, even when loading the page fails;
    # otherwise the browser process is left running.
    driver.quit()

soup = BeautifulSoup(html, "lxml")

八、完整的爬虫脚本模板

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python3
"""通用爬虫模板"""
import requests
from bs4 import BeautifulSoup
import json
import time
import random
from urllib.parse import urljoin

class BaseSpider:
    """Reusable crawler skeleton: shared session, retrying GET, crawl loop, JSON export.

    Subclasses implement :meth:`parse`; everything else can be used as is.
    """

    def __init__(self):
        # One Session for all requests, so the default headers are sent every time.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
            "Accept-Language": "zh-CN,zh;q=0.9",
        })

    def get(self, url, retries=3):
        """GET *url*, retrying failed attempts with exponential backoff.

        Args:
            url: Address to request.
            retries: Total number of attempts; must be at least 1.

        Returns:
            The response of the first attempt that succeeds with a non-error status.

        Raises:
            ValueError: If *retries* is smaller than 1.
            requests.RequestException: The error of the final attempt.
        """
        if retries < 1:
            # Without this guard the loop would not run and None would be returned.
            raise ValueError("retries must be at least 1")

        for attempt in range(retries):
            try:
                resp = self.session.get(url, timeout=10)
                resp.raise_for_status()
                return resp
            except requests.RequestException:
                if attempt == retries - 1:
                    raise  # bare raise keeps the original traceback
                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...

    def parse(self, html, url=""):
        """Turn one page into data; subclasses must override this.

        Args:
            html: Page source.
            url: Address the page came from.

        Returns:
            One record, a list of records, or a falsy value when the page
            yields nothing.
        """
        raise NotImplementedError

    def crawl(self, urls, delay=1):
        """Fetch and parse every URL, collecting the records in one flat list.

        A list returned by :meth:`parse` is merged into the result; any other
        truthy value is added as a single record. A URL that fails is reported
        and skipped.

        Args:
            urls: Sequence of addresses to visit.
            delay: Base pause in seconds between two requests; up to 0.5s of
                random jitter is added.

        Returns:
            The list of collected records.
        """
        results = []
        total = len(urls)
        for i, url in enumerate(urls, 1):
            print(f"[{i}/{total}] {url}")
            try:
                resp = self.get(url)
                data = self.parse(resp.text, url)
            except Exception as e:
                # Boundary of the crawl loop: one bad page must not stop the run.
                print(f" ❌ 错误: {e}")
            else:
                if isinstance(data, list):
                    # Merge per-page lists so the result is a flat list of
                    # records and save() reports the real record count.
                    results.extend(data)
                elif data:
                    results.append(data)

            # Pause between requests only; nothing follows the last URL.
            if i < total:
                time.sleep(delay + random.uniform(0, 0.5))

        return results

    def save(self, data, filename="output.json"):
        """Write *data* to *filename* as indented UTF-8 JSON and report the count.

        Args:
            data: JSON-serialisable collection of records.
            filename: Path of the file to write.
        """
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"📁 已保存 {len(data)} 条数据到 {filename}")

# ===== 使用 =====
class MySpider(BaseSpider):
    """Example spider: collects the title and absolute link of every ".item" element."""

    def parse(self, html, url=""):
        """Extract the items of one page.

        Args:
            html: Page source.
            url: Address of the page; relative links are resolved against it.

        Returns:
            A list of dicts with the keys "title" and "url". Elements lacking
            a ".title" node or an <a> tag are skipped.
        """
        soup = BeautifulSoup(html, "lxml")
        # Adjust the selectors to the target site's markup.
        items = []
        for elem in soup.select(".item"):
            title_elem = elem.select_one(".title")
            link_elem = elem.select_one("a")
            # select_one gives None when nothing matches; skip the incomplete
            # entry instead of losing the whole page to an AttributeError.
            if title_elem is None or link_elem is None:
                continue
            items.append({
                "title": title_elem.text.strip(),
                "url": urljoin(url, link_elem.get("href", "")),
            })
        return items

if __name__ == "__main__":
    # Listing pages 1 through 5 of the example site.
    urls = []
    for i in range(1, 6):
        urls.append(f"https://example.com/page/{i}")

    # Crawl with a 2-second base delay, then dump everything to data.json.
    spider = MySpider()
    results = spider.crawl(urls, delay=2)
    spider.save(results, filename="data.json")

九、爬虫礼仪

  1. ✅ 遵守 robots.txt(在域名后加 /robots.txt 查看)
  2. ✅ 控制访问频率,不要给目标服务器造成压力
  3. ✅ 设置合理的 User-Agent,标识自己身份
  4. ❌ 不要爬取个人隐私数据
  5. ❌ 未经许可,不要将爬取的数据用于商业用途
  6. ❌ 不要绕过网站的认证/付费机制

结语

爬虫是一项上手体验很好的技能——写几行代码就能自动抓取数据,成就感十足。

学习路径:

  1. requests + BeautifulSoup 抓取静态页面(本文内容)
  2. 学会分析浏览器开发者工具 Network 面板找到数据接口
  3. 遇到反爬再学代理 IP、验证码识别等进阶技巧

最好的练习:选一个你常访问的网站,试着抓取它。🕷️