前言

网页爬虫是 Python 最经典的应用场景之一。无论是收集数据、监控价格、抓取资讯还是学术研究,爬虫都能帮你自动获取网页数据。
本文从零开始,带你用 Python 写出能用的爬虫。
⚠️ 法律提醒:爬虫需遵守 robots.txt 协议,不要爬取个人隐私数据,控制访问频率,仅用于合法目的。
一、工具安装

pip install requests beautifulsoup4 lxml
| 库 | 作用 |
| --- | --- |
| requests | 发送 HTTP 请求,获取网页内容 |
| beautifulsoup4 | 解析 HTML,提取数据 |
| lxml | 高性能 HTML/XML 解析器 |
# 二、第一个爬虫:获取网页标题
# Section 2 - a first crawler: fetch one page, print its title and the
# first five links that carry an href.
import sys

import requests
from bs4 import BeautifulSoup

url = "https://blog.iot2045.cn"
# Browser-like User-Agent sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

# requests has no default timeout; without one a stalled server would make
# the script hang forever.
response = requests.get(url, headers=headers, timeout=10)
# Force UTF-8 decoding of the body instead of relying on the guessed charset.
response.encoding = "utf-8"

if response.status_code == 200:
    print("✅ 请求成功")
else:
    print(f"❌ 请求失败: {response.status_code} ")
    # sys.exit(1) instead of exit(): exit() is an interactive helper injected
    # by the site module, and a failed request should not report status 0.
    sys.exit(1)

soup = BeautifulSoup(response.text, "lxml")

# soup.title is None when the document has no <title> element.
title = soup.title.text if soup.title else "无标题"
print(f"网页标题: {title} ")

links = soup.find_all("a")
for link in links[:5]:
    href = link.get("href")
    text = link.text.strip()
    # Anchors without an href are skipped.
    if href:
        # Link text is truncated to 30 chars and padded to a 30-char column.
        print(f" {text[:30]:30s} → {href} ")
# 三、BeautifulSoup 核心用法 · 3.1 查找元素
# Section 3.1 - locating elements. `soup` is assumed to be a BeautifulSoup
# object built earlier, e.g. BeautifulSoup(response.text, "lxml").

# --- find() / find_all(): search by tag name and attributes ---
soup.find("h1")                                   # first <h1>, or None if absent
soup.find_all("p")                                # every <p> in the document
soup.find("div", class_="post")                   # class_ (with underscore) because `class` is a Python keyword
soup.find_all("div", class_="article")            # every <div> whose class includes "article"
soup.find(id="main")                              # element with id="main", any tag name
soup.find("meta", attrs={"name": "description"})  # attrs= dict form; needed because `name` is find()'s own first parameter
soup.find("a", href=True)                         # first <a> that has an href attribute, whatever its value

# --- select(): search with CSS selectors ---
soup.select(".post-title")                        # by class
soup.select("#header")                            # by id
soup.select("article h2")                         # <h2> anywhere inside an <article>
soup.select("div.post > h2")                      # <h2> that is a direct child of <div class="post">
soup.select("a[href^='https']")                   # <a> whose href starts with "https"
# 3.2 提取内容
# Section 3.2 - reading data out of a matched element. `soup` is assumed to
# be a BeautifulSoup object built earlier.
# NOTE(review): find() returns None when nothing matches, in which case every
# attribute access below would raise AttributeError.
element = soup.find("h2", class_="title")

# --- text content ---
element.text                    # all text inside the element, descendants included
element.string                  # the single child string; None when the element has several children
element.get_text(strip=True)    # like .text, with surrounding whitespace stripped from each piece

# --- attributes ---
element["href"]                 # attribute value; raises KeyError when the attribute is missing
element.get("href", "默认值")    # attribute value, or the given fallback when missing
element.attrs                   # dict of all attributes on the tag

# --- navigating the tree ---
element.parent                  # enclosing element
element.children                # iterator over direct children
element.next_sibling            # next node at the same level (may be a whitespace text node)
element.find_next("p")          # first <p> that appears after this element in the document
# 四、实战:抓取博客文章列表
# Section 4 - hands-on: scrape the blog's article list and save it as JSON.
import json
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def fetch_articles(url):
    """Fetch one listing page and return the articles found on it.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of dicts with the string keys ``title``, ``link``, ``date``
        and ``category``. Fields missing from the page are filled with a
        placeholder (``"无标题"`` for the title, ``""`` otherwise). ``link``
        is resolved against ``url``, so relative hrefs come back absolute.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
    }
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page and silently
    # returning an empty list.
    resp.raise_for_status()
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "lxml")

    articles = []
    for item in soup.select(".recent-post-item"):
        title_elem = item.select_one(".recent-post-info a")
        date_elem = item.select_one("time")
        category_elem = item.select_one(".article-category a")

        title = title_elem.text.strip() if title_elem is not None else "无标题"
        href = title_elem.get("href") if title_elem is not None else None
        # Resolve relative links against the page URL; a missing href becomes
        # "" so every field in the record is a string.
        link = urljoin(url, href) if href else ""
        date = date_elem.get("datetime", "") if date_elem is not None else ""
        category = category_elem.text.strip() if category_elem is not None else ""

        articles.append({
            "title": title,
            "link": link,
            "date": date,
            "category": category,
        })
    return articles


if __name__ == "__main__":
    url = "https://blog.iot2045.cn"
    articles = fetch_articles(url)
    print(f"✅ 抓取到 {len(articles)} 篇文章\n")

    for i, article in enumerate(articles, 1):
        print(f"{i} . {article['title']} ")
        print(f" 日期: {article['date']} 分类: {article['category']} ")
        print(f" 链接: {article['link']} \n")

    # ensure_ascii=False keeps the Chinese text readable in the output file.
    with open("articles.json", "w", encoding="utf-8") as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)
    print("📁 已保存到 articles.json")
# 五、数据存储 · 5.1 保存为 JSON
# Section 5.1 - round-trip the scraped records through a JSON file.
# `articles` is the list of dicts produced by the scraping step.
import json

# Write: serialise to a string, then write it out in one call.
# ensure_ascii=False keeps non-ASCII text as-is instead of \uXXXX escapes.
with open("data.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(articles, ensure_ascii=False, indent=2))

# Read: load the whole file back into Python objects.
with open("data.json", "r", encoding="utf-8") as in_file:
    articles = json.loads(in_file.read())
# 5.2 保存为 CSV
# Section 5.2 - write the scraped records to a CSV file, one row per article.
# `articles` is the list of dicts produced by the scraping step.
import csv

# newline="" lets the csv module control line endings itself.
# "utf-8-sig" prepends a BOM (presumably so spreadsheet tools detect UTF-8).
with open("articles.csv", "w", newline="", encoding="utf-8-sig") as csv_file:
    table = csv.DictWriter(
        csv_file,
        fieldnames=["title", "link", "date", "category"],
    )
    table.writeheader()
    for record in articles:
        table.writerow(record)
# 5.3 存入 MySQL
# Section 5.3 - insert the scraped records into a MySQL table.
# `articles` is the list of dicts produced by the scraping step.
import pymysql

# NOTE(review): the credentials are tutorial placeholders; load real ones
# from configuration or the environment rather than hard-coding them.
conn = pymysql.connect(
    host="localhost",
    user="root",
    password="password",
    database="spider_db",
    charset="utf8mb4",
)
try:
    with conn.cursor() as cursor:
        # Parameterised statement: the driver escapes the values. executemany
        # sends all rows through one call instead of one execute() per row.
        cursor.executemany(
            "INSERT INTO articles (title, link, date, category) VALUES (%s, %s, %s, %s)",
            [
                (article["title"], article["link"], article["date"], article["category"])
                for article in articles
            ],
        )
    conn.commit()
except Exception:
    # Undo the partial batch, then let the error propagate to the caller.
    conn.rollback()
    raise
finally:
    # Always release the connection, even when an insert fails.
    conn.close()
# 5.4 下载图片
# Section 5.4 - download an image to disk.
def download_image(img_url, save_path):
    """Stream the resource at ``img_url`` into the file ``save_path``.

    Prints a success message, or a failure message carrying the HTTP status
    code when the server does not answer 200. Returns ``None`` either way.

    Args:
        img_url: Address of the image to download.
        save_path: Destination file path; opened in binary write mode.

    Raises:
        requests.RequestException: On connection failure or timeout.
        OSError: If ``save_path`` cannot be opened or written.
    """
    headers = {"User-Agent": "Mozilla/5.0 ..."}
    # With stream=True the connection stays open until the body is consumed
    # or the response is closed; the with-block guarantees the close on every
    # path, including the non-200 branch where the body is never read.
    # The timeout keeps a stalled server from hanging the call.
    with requests.get(img_url, headers=headers, stream=True, timeout=10) as resp:
        if resp.status_code != 200:
            print(f"❌ 下载失败: {resp.status_code} ")
            return
        with open(save_path, "wb") as f:
            # Write in 1 KiB chunks so the image is never held in memory whole.
            for chunk in resp.iter_content(1024):
                f.write(chunk)
    print(f"✅ 下载完成: {save_path} ")
# 六、分页爬取
# Section 6 - crawling paginated listings.
def fetch_all_pages(base_url, max_pages=5):
    """Fetch pages 1..max_pages of a paginated listing and merge the results.

    Page URLs are built as ``<base_url>/page/<n>/``. A page that fails to
    download or parse is reported and skipped; the crawl continues.

    Args:
        base_url: Site root, with or without a trailing slash.
        max_pages: Number of pages to request, starting from page 1.

    Returns:
        One flat list holding the articles of every page fetched successfully.
    """
    all_articles = []
    # Drop a trailing slash so the joined URL never contains "//page/".
    root = base_url.rstrip("/")

    for page in range(1, max_pages + 1):
        url = f"{root}/page/{page}/"
        print(f"抓取第 {page} 页: {url} ")
        try:
            articles = fetch_articles(url)
        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            print(f" → 抓取失败: {e} ")
        else:
            all_articles.extend(articles)
            print(f" → 获取 {len(articles)} 篇")

        # Throttle between requests whether or not the page succeeded, so a
        # failing server is not hammered; no wait is needed after the last page.
        if page < max_pages:
            time.sleep(1)

    return all_articles
# 七、反爬与应对策略 · 7.1 设置请求头
# Section 7.1 - request headers to send with each request.
headers = {
    # The value is abbreviated with "..." here; substitute a full UA string.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...",
    # Page the request claims to have been linked from.
    "Referer": "https://www.google.com/",
    # Preferred response languages, with q-value weights.
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}
# 7.2 控制请求频率
# Section 7.2 - pausing between requests.
# NOTE(review): the two sleeps look like alternatives to choose from, not a
# sequence to run back to back.
import time
import random

time.sleep(1)                      # fixed pause of one second
time.sleep(random.uniform(1, 3))   # random pause between 1 and 3 seconds
# 7.3 使用 Session 保持登录状态
# Section 7.3 - keep a login alive across requests with requests.Session.
# A Session stores the cookies set by the login response and sends them on
# every later request made through it.
# The with-block closes the session's pooled connections when done.
with requests.Session() as session:
    session.headers.update({
        "User-Agent": "Mozilla/5.0 ..."
    })

    # NOTE(review): placeholder credentials; never hard-code real ones.
    login_data = {"username": "admin", "password": "123456"}
    login_resp = session.post("https://example.com/login", data=login_data, timeout=10)
    # Stop here if the login request itself was rejected (4xx/5xx).
    login_resp.raise_for_status()

    # Sent with the cookies obtained from the login above.
    resp = session.get("https://example.com/profile", timeout=10)
7.4 处理动态渲染页面

有些网站用 JavaScript 动态加载内容,requests 只能拿到初始 HTML,拿不到渲染后的数据。这时可以用 Selenium 驱动真实浏览器:
# Fetch a JavaScript-rendered page with a real browser driven by Selenium,
# then hand the rendered HTML to BeautifulSoup.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
try:
    driver.get("https://example.com")
    # Crude fixed wait to give the page's scripts time to run.
    time.sleep(3)
    # page_source is the DOM serialised after the scripts have run.
    html = driver.page_source
finally:
    # Always shut the browser down, even if loading the page raised;
    # otherwise the browser process is left running.
    driver.quit()

# `html` is a plain string, so parsing works after the browser is closed.
soup = BeautifulSoup(html, "lxml")
# 八、完整的爬虫脚本模板
# Section 8 - a complete crawler script template.
"""Generic crawler template."""
import json
import random
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


class BaseSpider:
    """Reusable crawler skeleton: session, retrying GET, crawl loop, JSON save.

    Subclasses implement :meth:`parse` to turn one page's HTML into data.
    """

    def __init__(self):
        # One shared Session reuses connections and carries the default
        # headers on every request.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
            "Accept-Language": "zh-CN,zh;q=0.9",
        })

    def get(self, url, retries=3):
        """GET ``url``, retrying failed attempts with exponential backoff.

        Args:
            url: Address to fetch.
            retries: Total number of attempts; values below 1 are treated
                as 1 so the method never falls through and returns ``None``.

        Returns:
            The successful ``requests.Response``.

        Raises:
            requests.RequestException: The error of the final attempt
                (connection failure, timeout, or 4xx/5xx status).
        """
        attempts = max(1, retries)
        for attempt in range(attempts):
            try:
                resp = self.session.get(url, timeout=10)
                resp.raise_for_status()
                return resp
            except requests.RequestException:
                if attempt == attempts - 1:
                    # Bare raise keeps the original traceback intact.
                    raise
                # Back off 1s, 2s, 4s, ... before the next attempt.
                time.sleep(2 ** attempt)

    def parse(self, html, url=""):
        """Turn one page's HTML into data; subclasses must override this."""
        raise NotImplementedError

    def crawl(self, urls, delay=1):
        """Fetch and parse every URL, collecting the parsed data.

        A URL that fails to download or parse is reported and skipped.

        Args:
            urls: Iterable of page addresses.
            delay: Base pause in seconds between requests; up to 0.5 s of
                random jitter is added to each pause.

        Returns:
            A flat list of items. When :meth:`parse` returns a list its
            elements are merged in, so ``len(result)`` counts items rather
            than pages; any other truthy value is appended as one item.
        """
        urls = list(urls)  # allow any iterable; the total is needed for progress
        total = len(urls)
        results = []
        for i, url in enumerate(urls, 1):
            print(f"[{i} /{total} ] {url} ")
            try:
                resp = self.get(url)
                data = self.parse(resp.text, url)
            except Exception as e:
                # Best-effort crawl: one bad page must not stop the run.
                print(f" ❌ 错误: {e} ")
            else:
                if isinstance(data, list):
                    results.extend(data)
                elif data:
                    results.append(data)
            # Pause between requests only; nothing follows the last URL.
            if i < total:
                time.sleep(delay + random.uniform(0, 0.5))
        return results

    def save(self, data, filename="output.json"):
        """Write ``data`` to ``filename`` as UTF-8 JSON and report the count."""
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"📁 已保存 {len(data)} 条数据到 {filename} ")


class MySpider(BaseSpider):
    """Example spider: extracts a title and an absolute URL from each ``.item``."""

    def parse(self, html, url=""):
        """Return a list of ``{"title", "url"}`` dicts found in ``html``.

        Items lacking a ``.title`` node or an ``<a>`` are skipped, so one
        malformed entry does not discard the whole page.
        """
        soup = BeautifulSoup(html, "lxml")
        items = []
        for elem in soup.select(".item"):
            title_elem = elem.select_one(".title")
            link_elem = elem.select_one("a")
            if title_elem is None or link_elem is None:
                continue
            items.append({
                "title": title_elem.text.strip(),
                # Resolve relative hrefs against the page URL.
                "url": urljoin(url, link_elem.get("href", "")),
            })
        return items


if __name__ == "__main__":
    spider = MySpider()
    urls = [f"https://example.com/page/{i}" for i in range(1, 6)]
    results = spider.crawl(urls, delay=2)
    spider.save(results, "data.json")
九、爬虫礼仪
✅ 遵守 robots.txt(在域名后加 /robots.txt 查看)
✅ 控制访问频率,不要给目标服务器造成压力
✅ 设置合理的 User-Agent,标识自己身份
❌ 不要爬取个人隐私数据
❌ 未确认获得许可前,不要将爬取的数据用于商业用途
❌ 不要绕过网站的认证/付费机制
结语

爬虫是一门上手体验很好的技能——写几行代码就能自动抓取数据,成就感十足。
学习路径:
用 requests + BeautifulSoup 抓取静态页面(本文内容)
学会用浏览器开发者工具的 Network 面板分析请求,找到数据接口
遇到反爬再学代理 IP、验证码识别等进阶技巧
最好的练习:选一个你常访问的网站,试着抓取它。🕷️