Web Crawlers Explained
A web crawler is a program that automatically browses the internet and collects information. This article walks through the principles, techniques, and practice of web crawling.
Basic Concepts
1. What is a web crawler?
A web crawler is an automated program that:
· Browses the web according to a defined set of rules
· Downloads page content
· Extracts the information of interest
· Stores and organizes the data
2. The crawl workflow
```
seed URLs → download page → parse content → extract data → store data
    ↑                                             ↓
discover new URLs ←── process links ←── clean data
```
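To make that loop concrete, here is a minimal sketch of the workflow above using requests and BeautifulSoup with a simple URL queue. It is an illustration only; real crawlers add politeness delays, robots.txt checks, error handling, and persistent storage, all of which are covered later in this article.
```python
from collections import deque
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def simple_crawl(seed_url, max_pages=10):
    """Minimal sketch of the crawl loop: fetch, parse, extract, enqueue new URLs."""
    queue = deque([seed_url])   # URLs waiting to be crawled
    seen = {seed_url}           # avoid re-crawling the same page
    results = []

    while queue and len(results) < max_pages:
        url = queue.popleft()
        try:
            response = requests.get(url, timeout=10)          # download page
        except requests.RequestException:
            continue
        soup = BeautifulSoup(response.text, 'html.parser')    # parse content
        title = soup.title.string if soup.title and soup.title.string else ''
        results.append({'url': url, 'title': title})          # extract data
        for a in soup.find_all('a', href=True):               # discover new URLs
            link = urljoin(url, a['href'])
            if link.startswith('http') and link not in seen:
                seen.add(link)
                queue.append(link)
    return results   # the "store data" step is left to the caller
```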
The Crawler Tech Stack
Core libraries

| Library       | Purpose                 | Highlights                                  |
| ------------- | ----------------------- | ------------------------------------------- |
| Requests      | HTTP requests           | Simple, easy-to-use HTTP library            |
| BeautifulSoup | HTML parsing            | Parses HTML/XML, beginner-friendly          |
| Scrapy        | Full crawling framework | High performance, suited to large projects  |
| Selenium      | Browser automation      | Handles JavaScript-rendered dynamic content |
| PyQuery       | HTML parsing            | jQuery-style syntax, quick and convenient   |
Hands-On Examples
1. Basic crawler with Requests + BeautifulSoup
```python
import requests
from bs4 import BeautifulSoup

def basic_crawler(url):
    """
    Basic crawler example: fetch a page's title and links.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        # Send the HTTP request
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        # Check whether the request succeeded
        if response.status_code == 200:
            # Parse the HTML content
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract the page title
            title = soup.title.string if soup.title else 'No title'
            # Extract all links
            links = []
            for link in soup.find_all('a', href=True):
                links.append({
                    'text': link.get_text().strip(),
                    'url': link['href']
                })
            return {
                'title': title,
                'url': url,
                'links': links[:10]  # return only the first 10 links
            }
        else:
            print(f"Request failed with status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Usage example
if __name__ == "__main__":
    result = basic_crawler('https://httpbin.org/html')
    if result:
        print(f"Page title: {result['title']}")
        print("First 5 links:")
        for link in result['links'][:5]:
            print(f"  {link['text']} -> {link['url']}")
```
2. Data-extraction crawler: scraping book listings
```python
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

class BookScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.books_data = []

    def scrape_books(self, base_url, pages=3):
        """
        Scrape book listings.
        """
        for page in range(1, pages + 1):
            print(f"Scraping page {page}...")
            # Build the URL (an example site is used here)
            url = f"{base_url}?page={page}"
            try:
                response = self.session.get(url, timeout=10)
                soup = BeautifulSoup(response.text, 'html.parser')
                # Assume each book sits in a div with a specific class
                book_elements = soup.find_all('div', class_='book-item')
                for book in book_elements:
                    book_info = self.extract_book_info(book)
                    if book_info:
                        self.books_data.append(book_info)
                # Polite delay to avoid putting pressure on the server
                time.sleep(1)
            except Exception as e:
                print(f"Error while scraping page {page}: {e}")
                continue

    def extract_book_info(self, book_element):
        """
        Extract fields from a single book element.
        """
        try:
            title = book_element.find('h2').get_text().strip()
            # Assume the price lives in an element with class "price"
            price_element = book_element.find('span', class_='price')
            price = price_element.get_text().strip() if price_element else 'Unknown'
            # Assume the author lives in an element with class "author"
            author_element = book_element.find('span', class_='author')
            author = author_element.get_text().strip() if author_element else 'Unknown'
            return {
                'title': title,
                'author': author,
                'price': price,
                'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
            }
        except Exception as e:
            print(f"Error while extracting book info: {e}")
            return None

    def save_to_csv(self, filename='books.csv'):
        """
        Save the collected data to a CSV file.
        """
        if self.books_data:
            df = pd.DataFrame(self.books_data)
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            print(f"Saved {len(self.books_data)} records to {filename}")
        else:
            print("No data to save")

# Usage example
if __name__ == "__main__":
    scraper = BookScraper()
    # Note: this is a placeholder URL; replace it with the real target site
    scraper.scrape_books('https://example.com/books', pages=2)
    scraper.save_to_csv()
```
3. Advanced crawling with the Scrapy framework
First install Scrapy: pip install scrapy
```python
# After creating a Scrapy project, put book_spider.py in the spiders directory
import scrapy
import json

class BookSpider(scrapy.Spider):
    name = 'book_spider'
    # Per-spider settings
    custom_settings = {
        'DOWNLOAD_DELAY': 1,        # delay between downloads
        'CONCURRENT_REQUESTS': 2,   # number of concurrent requests
        # FEEDS replaces the older FEED_FORMAT/FEED_URI settings
        'FEEDS': {'books_output.json': {'format': 'json'}},
    }

    def start_requests(self):
        """Seed URLs"""
        urls = [
            'https://httpbin.org/json',  # example API
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Parse the response"""
        try:
            data = json.loads(response.text)
            # Extract items (adjust to the structure of the real API)
            if 'slideshow' in data and 'slides' in data['slideshow']:
                for slide in data['slideshow']['slides']:
                    yield {
                        'title': slide.get('title', ''),
                        'type': slide.get('type', ''),
                        'items': slide.get('items', [])
                    }
        except json.JSONDecodeError:
            self.logger.error('Failed to parse JSON')
```
Commands to create a Scrapy project:
```bash
scrapy startproject book_project
cd book_project
scrapy genspider example example.com
```
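Once book_spider.py is saved under the project's spiders/ directory, the spider can be run with scrapy crawl book_spider; the FEEDS setting shown above then writes the scraped items to books_output.json.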
Handling Dynamic Content with Selenium
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

class DynamicContentCrawler:
    def __init__(self):
        # Configure Chrome options
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # headless mode
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, 10)

    def crawl_dynamic_content(self, url):
        """
        Crawl content that is loaded dynamically by JavaScript.
        """
        try:
            self.driver.get(url)
            # Wait for a specific element to finish loading
            self.wait.until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            # Simulate scrolling to trigger lazy loading (if needed)
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            # Grab the fully rendered page source
            page_source = self.driver.page_source
            # Parse it with BeautifulSoup
            soup = BeautifulSoup(page_source, 'html.parser')
            # Extract the data of interest
            titles = soup.find_all('h1') + soup.find_all('h2') + soup.find_all('h3')
            results = [title.get_text().strip() for title in titles]
            return results
        except Exception as e:
            print(f"Error while crawling dynamic content: {e}")
            return []
        finally:
            self.driver.quit()

# Usage example
if __name__ == "__main__":
    crawler = DynamicContentCrawler()
    results = crawler.crawl_dynamic_content('https://example.com')
    print("Extracted headings:", results)
```
Crawler Ethics and Legal Issues
Rules you must follow
```python
import urllib.robotparser

class EthicalCrawler:
    def __init__(self):
        self.respect_rules = True

    def check_robots_txt(self, domain):
        """Check robots.txt before crawling"""
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(f"{domain}/robots.txt")
        rp.read()
        return rp.can_fetch("*", f"{domain}/target-page")

    def respectful_crawling(self):
        """Polite crawling practices"""
        rules = {
            'robots_txt': 'Obey the rules in robots.txt',
            'rate_limiting': 'Use a reasonable request rate',
            'working_hours': 'Avoid crawling during the site\'s peak hours',
            'data_usage': 'Use the data only for lawful purposes',
            'copyright': 'Respect copyright and intellectual property'
        }
        return rules

# Request-header best practices
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}
```
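A brief usage sketch of the class above, assuming a placeholder domain; in practice you would pass the exact path you intend to fetch to can_fetch rather than the /target-page path hard-coded in check_robots_txt.
```python
import time
import requests

crawler = EthicalCrawler()
domain = "https://example.com"   # placeholder target

# Only fetch if robots.txt allows it, and pace the requests
if crawler.check_robots_txt(domain):
    response = requests.get(f"{domain}/target-page", headers=HEADERS, timeout=10)
    print(response.status_code)
    time.sleep(2)   # rate limiting between requests
else:
    print("robots.txt disallows crawling this path")
```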
Data Storage Options
Examples of several storage formats
```python
import json
import sqlite3
import pandas as pd
from pymongo import MongoClient

class DataStorage:
    """Storage helper supporting several output formats"""

    @staticmethod
    def save_to_json(data, filename):
        """Save as JSON"""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"Data saved to {filename}")

    @staticmethod
    def save_to_csv(data, filename):
        """Save as CSV"""
        if data and isinstance(data[0], dict):
            df = pd.DataFrame(data)
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            print(f"Data saved to {filename}")

    @staticmethod
    def save_to_sqlite(data, db_name, table_name):
        """Save to a SQLite database"""
        conn = sqlite3.connect(db_name)
        cursor = conn.cursor()
        # Create the table if it does not exist
        if data and isinstance(data[0], dict):
            columns = list(data[0].keys())
            create_table_sql = f'''
                CREATE TABLE IF NOT EXISTS {table_name} (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    {', '.join([f'{col} TEXT' for col in columns])}
                )
            '''
            cursor.execute(create_table_sql)
            # Insert the rows
            for item in data:
                placeholders = ', '.join(['?' for _ in columns])
                sql = f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES ({placeholders})"
                cursor.execute(sql, [item[col] for col in columns])
            conn.commit()
            print(f"Data saved to database {db_name}.{table_name}")
        conn.close()

    @staticmethod
    def save_to_mongodb(data, db_name, collection_name, connection_string="mongodb://localhost:27017/"):
        """Save to MongoDB"""
        try:
            client = MongoClient(connection_string)
            db = client[db_name]
            collection = db[collection_name]
            if data:
                collection.insert_many(data)
                print(f"Data saved to MongoDB: {db_name}.{collection_name}")
            client.close()
        except Exception as e:
            print(f"MongoDB save failed: {e}")

# Usage example
sample_data = [
    {'name': 'Book 1', 'price': '29.99', 'author': 'Author A'},
    {'name': 'Book 2', 'price': '39.99', 'author': 'Author B'}
]

# Save in several formats
DataStorage.save_to_json(sample_data, 'books.json')
DataStorage.save_to_csv(sample_data, 'books.csv')
DataStorage.save_to_sqlite(sample_data, 'books.db', 'books')
```
Dealing with Anti-Scraping Measures
Common anti-crawling measures and how to respond
```python
import random
import requests

class AntiAntiCrawler:
    """Strategies for coping with anti-crawling measures"""

    def __init__(self):
        self.proxies = self.get_proxies()
        self.current_proxy_index = 0

    def get_proxies(self):
        """Return a list of proxy addresses (must be configured with real proxies)"""
        # In practice this list would come from a proxy provider
        return [
            # 'http://proxy1:port',
            # 'http://proxy2:port'
        ]

    def rotate_proxy(self):
        """Rotate through the proxy list"""
        if self.proxies:
            proxy = self.proxies[self.current_proxy_index]
            self.current_proxy_index = (self.current_proxy_index + 1) % len(self.proxies)
            return {'http': proxy, 'https': proxy}
        return None

    def get_random_delay(self, min_delay=1, max_delay=3):
        """Generate a random delay between requests"""
        return random.uniform(min_delay, max_delay)

    def random_headers(self):
        """Generate randomized request headers"""
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
        ]
        return {
            'User-Agent': random.choice(user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/'
        }

    def handle_cookies(self):
        """Keep cookies across requests"""
        # A Session object persists cookies between requests
        session = requests.Session()
        return session
```
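As a rough illustration, the pieces above might be combined into a single fetch helper like the sketch below; the URL is a placeholder, and proxy rotation only takes effect once get_proxies() returns real proxy addresses.
```python
import time
import requests

def polite_get(url, crawler):
    """Fetch one URL using rotating proxies, random headers, and a random delay."""
    time.sleep(crawler.get_random_delay())      # random pacing between requests
    response = requests.get(
        url,
        headers=crawler.random_headers(),       # randomized User-Agent, etc.
        proxies=crawler.rotate_proxy(),         # None if no proxies are configured
        timeout=10,
    )
    return response

crawler = AntiAntiCrawler()
resp = polite_get('https://example.com', crawler)   # placeholder URL
print(resp.status_code)
```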
Crawler Project Management
A complete project layout
```
web_crawler_project/
├── spiders/             # crawler code
│   ├── __init__.py
│   ├── basic_crawler.py
│   ├── api_crawler.py
│   └── dynamic_crawler.py
├── utils/               # helper modules
│   ├── __init__.py
│   ├── storage.py
│   ├── parser.py
│   └── anti_anti.py
├── data/                # data storage
│   ├── raw/
│   └── processed/
├── logs/                # log files
├── config/              # configuration
│   └── settings.py
├── requirements.txt     # dependencies
└── main.py              # entry point
```
Configuration management
```python
# config/settings.py
import os
from datetime import datetime

class Config:
    # Base paths
    BASE_DIR = os.path.dirname(os.path.dirname(__file__))
    DATA_DIR = os.path.join(BASE_DIR, 'data')
    LOG_DIR = os.path.join(BASE_DIR, 'logs')

    # Crawler settings
    REQUEST_TIMEOUT = 10
    MAX_RETRIES = 3
    DELAY_BETWEEN_REQUESTS = 1

    # Logging settings
    LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    LOG_FILE = os.path.join(LOG_DIR, f'crawler_{datetime.now().strftime("%Y%m%d")}.log')

    @classmethod
    def ensure_directories(cls):
        """Make sure the required directories exist"""
        for directory in [cls.DATA_DIR, cls.LOG_DIR]:
            os.makedirs(directory, exist_ok=True)
```
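The project layout above also lists a main.py entry point that is not shown here. One possible sketch, assuming it wires Config into the standard logging module and then runs one of the crawlers (the spiders.basic_crawler import path is hypothetical), might look like this:
```python
# main.py (sketch; assumes the package layout shown above)
import logging

from config.settings import Config
from spiders.basic_crawler import basic_crawler   # hypothetical module path

def main():
    # Create data/ and logs/ before anything writes to them
    Config.ensure_directories()

    # Route log output to the dated file defined in Config
    logging.basicConfig(
        filename=Config.LOG_FILE,
        format=Config.LOG_FORMAT,
        level=logging.INFO,
    )
    logger = logging.getLogger('crawler')

    logger.info("Crawl started")
    result = basic_crawler('https://httpbin.org/html')
    logger.info("Crawl finished: %s", result['title'] if result else 'no result')

if __name__ == "__main__":
    main()
```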
Best-Practice Summary
Development principles
1. Follow the law: respect site terms of service and do not scrape sensitive information
2. Throttle requests: avoid putting load on the target site
3. Handle errors: robust exception handling and retry logic (see the sketch after this list)
4. Clean the data: validate and clean everything you scrape
5. Design modularly: keep the code reusable and maintainable
6. Log everything: detailed run logs make debugging much easier
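To make point 3 concrete, here is a minimal retry helper with exponential backoff and logging; the defaults mirror the REQUEST_TIMEOUT and MAX_RETRIES values from the Config class above.
```python
import logging
import time

import requests

logger = logging.getLogger('crawler')

def fetch_with_retry(url, max_retries=3, timeout=10):
    """Fetch a URL, retrying with exponential backoff and logging each failure."""
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()   # treat 4xx/5xx responses as errors
            return response
        except requests.RequestException as e:
            logger.warning("Attempt %d/%d failed for %s: %s", attempt, max_retries, url, e)
            if attempt == max_retries:
                raise                      # give up after the last attempt
            time.sleep(2 ** attempt)       # back off: 2s, 4s, 8s, ...
```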
Performance-optimization tips
```python
# Use concurrency to improve throughput
import concurrent.futures

def concurrent_crawler(urls):
    """Crawl several URLs concurrently (reuses basic_crawler from the first example)"""
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_to_url = {executor.submit(basic_crawler, url): url for url in urls}
        results = []
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                result = future.result()
                if result:
                    results.append(result)
            except Exception as e:
                print(f"Failed to crawl {url}: {e}")
    return results
```
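A quick usage sketch with placeholder URLs; keep max_workers modest so that concurrency does not undo the rate-limiting advice above, especially when every URL points at the same site.
```python
if __name__ == "__main__":
    urls = [
        'https://httpbin.org/html',     # placeholder URLs
        'https://httpbin.org/links/5',
    ]
    for page in concurrent_crawler(urls):
        print(page['title'])
```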
Suggested Learning Path
1. Beginner: master basic crawling with Requests + BeautifulSoup
2. Intermediate: learn the Scrapy framework, data storage, and anti-scraping countermeasures
3. Advanced: distributed crawlers, JavaScript rendering, mobile-app data collection
4. Specializations: search-engine crawlers, large-scale data collection, real-time data streams
Web crawling is a highly practical skill: start with simple projects and work your way up to more complex scenarios. Remember that with great power comes great responsibility, so always keep your crawling within legal and ethical boundaries.