Web Scraping
Web scraping is the process of extracting data from websites. Learn how to use Python libraries like requests and BeautifulSoup to scrape web content, handle dynamic sites, and respect website terms of service.
What is Web Scraping?
Web scraping is the automated process of extracting data from websites. It involves fetching web pages, parsing the HTML content, and extracting the desired information. Python provides excellent libraries for web scraping including requests for HTTP requests and BeautifulSoup for HTML parsing.
"Web scraping allows you to programmatically extract data from websites, turning unstructured web content into structured data."
Basic Web Scraping with Requests
Getting started with HTTP requests:
import requests
# Basic GET request
response = requests.get('https://httpbin.org/html')
print(f"Status Code: {response.status_code}")
print(f"Content Type: {response.headers['content-type']}")
print(f"Content Length: {len(response.text)}")
# Check if request was successful
if response.status_code == 200:
    print("Request successful!")
    print("First 200 characters:")
    print(response.text[:200])
else:
    print(f"Request failed with status code: {response.status_code}")
# GET request with parameters
params = {'key': 'value', 'another': 'param'}
response = requests.get('https://httpbin.org/get', params=params)
print(f"\nURL with parameters: {response.url}")
print(f"Response: {response.json()}")
# POST request
data = {'username': 'user', 'password': 'pass'}
response = requests.post('https://httpbin.org/post', data=data)
print(f"\nPOST response: {response.json()}")
# Headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}
response = requests.get('https://httpbin.org/headers', headers=headers)
print(f"\nHeaders response: {response.json()}")
BeautifulSoup Basics
Parsing HTML with BeautifulSoup:
import requests
from bs4 import BeautifulSoup
# Fetch a webpage
url = 'https://quotes.toscrape.com/'
response = requests.get(url)
# Create BeautifulSoup object
soup = BeautifulSoup(response.content, 'html.parser')
print("Page Title:", soup.title.text if soup.title else "No title found")
print("Page Title Tag:", soup.title.name if soup.title else "No title tag")
# Find elements by tag name
h1_tags = soup.find_all('h1')
print(f"\nFound {len(h1_tags)} h1 tags:")
for h1 in h1_tags:
print(f" {h1.text.strip()}")
# Find elements by class
quote_divs = soup.find_all('div', class_='quote')
print(f"\nFound {len(quote_divs)} quote divs")
# Find elements by id
navbar = soup.find(id='navbar')
if navbar:
print(f"Navbar found: {navbar.text.strip()[:50]}...")
# Get all links
links = soup.find_all('a')
print(f"\nFound {len(links)} links:")
for link in links[:5]: # Show first 5
    href = link.get('href')
    text = link.text.strip()
    print(f" {text}: {href}")
# Navigate the DOM
first_quote = soup.find('div', class_='quote')
if first_quote:
    text_span = first_quote.find('span', class_='text')
    author_span = first_quote.find('small', class_='author')
    if text_span and author_span:
        print(f"\nFirst quote: '{text_span.text}' by {author_span.text}")
Advanced BeautifulSoup Selectors
Powerful selection techniques:
import requests
from bs4 import BeautifulSoup
url = 'https://quotes.toscrape.com/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
# CSS selectors
quotes = soup.select('.quote')
print(f"Found {len(quotes)} quotes using CSS selector")
# Select by attribute
author_links = soup.select('a[href*="/author/"]')
print(f"Found {len(author_links)} author links")
# Select nested elements
quote_texts = soup.select('.quote .text')
print(f"Found {len(quote_texts)} quote texts")
# Select by multiple classes
tags = soup.select('.tag')
print(f"Found {len(tags)} tags")
# Extract quote data
for quote in quotes[:3]: # First 3 quotes
    text = quote.select_one('.text')
    author = quote.select_one('.author')
    tags = quote.select('.tag')
    if text and author:
        tag_list = [tag.text for tag in tags]
        print(f"\nQuote: {text.text}")
        print(f"Author: {author.text}")
        print(f"Tags: {', '.join(tag_list)}")
# Navigate siblings and parents
first_quote = soup.select_one('.quote')
if first_quote:
    # Next sibling
    next_quote = first_quote.find_next_sibling('div', class_='quote')
    if next_quote:
        print(f"\nNext quote text: {next_quote.select_one('.text').text[:50]}...")
    # Parent element
    parent = first_quote.parent
    print(f"Parent tag: {parent.name}")
# Find elements with specific attributes
about_link = soup.find('a', href='/about/')
if about_link:
print(f"About link text: {about_link.text}")
# Regular expressions with BeautifulSoup
import re
author_pattern = re.compile(r'/author/')
author_pages = soup.find_all('a', href=author_pattern)
print(f"\nFound {len(author_pages)} author page links")
Handling Dynamic Content
Dealing with JavaScript-rendered content:
# For dynamic content, you might need Selenium or similar tools
# Note: This requires additional setup and may not work in all environments
# Example with Selenium (conceptual)
"""
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# Set up headless browser
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=chrome_options)
try:
    # Navigate to page
    driver.get("https://quotes.toscrape.com/js/")
    # Wait for content to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "quote"))
    )
    # Now scrape the dynamic content
    quotes = driver.find_elements(By.CLASS_NAME, "quote")
    print(f"Found {len(quotes)} dynamic quotes")
    for quote in quotes[:3]:
        text = quote.find_element(By.CLASS_NAME, "text").text
        author = quote.find_element(By.CLASS_NAME, "author").text
        print(f"Quote: {text[:50]}... by {author}")
finally:
    driver.quit()
"""
# Alternative: Check for API endpoints
import requests
import json
# Many sites provide APIs - check for them first
def check_for_api(url):
    # Common API patterns
    api_patterns = [
        '/api/', '/v1/', '/v2/', '/graphql',
        '?format=json', '&format=json'
    ]
    for pattern in api_patterns:
        api_url = url + pattern
        try:
            response = requests.get(api_url, timeout=5)
            if response.status_code == 200:
                content_type = response.headers.get('content-type', '')
                if 'json' in content_type.lower():
                    print(f"Potential API found: {api_url}")
                    return response.json()
        except requests.exceptions.RequestException:
            continue
    return None
# Example usage
api_data = check_for_api("https://quotes.toscrape.com")
if api_data:
    print("API data found!")
else:
    print("No obvious API endpoints found")
Data Storage and Export
Saving scraped data to files:
import requests
from bs4 import BeautifulSoup
import csv
import json
import sqlite3
import time
def scrape_quotes():
"""Scrape quotes from quotes.toscrape.com"""
base_url = 'https://quotes.toscrape.com'
quotes_data = []
# Scrape multiple pages
for page in range(1, 3): # First 2 pages
url = f"{base_url}/page/{page}/"
response = requests.get(url)
if response.status_code != 200:
break
soup = BeautifulSoup(response.content, 'html.parser')
quotes = soup.select('.quote')
for quote in quotes:
text_elem = quote.select_one('.text')
author_elem = quote.select_one('.author')
tags_elems = quote.select('.tag')
if text_elem and author_elem:
quote_data = {
'text': text_elem.text.strip('"'),
'author': author_elem.text,
'tags': [tag.text for tag in tags_elems],
'page': page
}
quotes_data.append(quote_data)
time.sleep(1) # Be respectful to the server
return quotes_data
def save_to_csv(quotes, filename='quotes.csv'):
"""Save quotes to CSV file"""
if not quotes:
return
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['text', 'author', 'tags', 'page']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for quote in quotes:
# Convert tags list to string
quote_copy = quote.copy()
quote_copy['tags'] = ', '.join(quote['tags'])
writer.writerow(quote_copy)
print(f"Saved {len(quotes)} quotes to {filename}")
def save_to_json(quotes, filename='quotes.json'):
"""Save quotes to JSON file"""
with open(filename, 'w', encoding='utf-8') as jsonfile:
json.dump(quotes, jsonfile, indent=2, ensure_ascii=False)
print(f"Saved {len(quotes)} quotes to {filename}")
def save_to_sqlite(quotes, db_name='quotes.db'):
"""Save quotes to SQLite database"""
conn = sqlite3.connect(db_name)
cursor = conn.cursor()
# Create table
cursor.execute('''
CREATE TABLE IF NOT EXISTS quotes (
id INTEGER PRIMARY KEY,
text TEXT NOT NULL,
author TEXT NOT NULL,
tags TEXT,
page INTEGER
)
''')
# Insert data
for quote in quotes:
cursor.execute('''
INSERT INTO quotes (text, author, tags, page)
VALUES (?, ?, ?, ?)
''', (
quote['text'],
quote['author'],
', '.join(quote['tags']),
quote['page']
))
conn.commit()
conn.close()
print(f"Saved {len(quotes)} quotes to {db_name}")
# Scrape and save data
quotes = scrape_quotes()
print(f"Scraped {len(quotes)} quotes")
if quotes:
    save_to_csv(quotes)
    save_to_json(quotes)
    save_to_sqlite(quotes)
    # Display sample
    print("\nSample quote:")
    sample = quotes[0]
    print(f"Text: {sample['text']}")
    print(f"Author: {sample['author']}")
    print(f"Tags: {', '.join(sample['tags'])}")
Handling Errors and Rate Limiting
Robust web scraping with error handling:
import requests
from bs4 import BeautifulSoup
import time
import random
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
class WebScraper:
    def __init__(self):
        self.session = requests.Session()
        # Configure retries
        retry_strategy = Retry(
            total=3,
            status_forcelist=[429, 500, 502, 503, 504],
            backoff_factor=1
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)
        # Set headers to mimic browser
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        })
    def get_page(self, url, timeout=10):
        """Get page with error handling"""
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None
    def parse_quotes(self, html_content):
        """Parse quotes from HTML"""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            quotes = []
            quote_elements = soup.select('.quote')
            for quote_elem in quote_elements:
                try:
                    text = quote_elem.select_one('.text')
                    author = quote_elem.select_one('.author')
                    if text and author:
                        quote_data = {
                            'text': text.text.strip('"'),
                            'author': author.text,
                            'tags': [tag.text for tag in quote_elem.select('.tag')]
                        }
                        quotes.append(quote_data)
                except Exception as e:
                    print(f"Error parsing quote: {e}")
                    continue
            return quotes
        except Exception as e:
            print(f"Error parsing HTML: {e}")
            return []
    def scrape_with_rate_limit(self, base_url, max_pages=5):
        """Scrape with rate limiting"""
        all_quotes = []
        for page in range(1, max_pages + 1):
            url = f"{base_url}/page/{page}/"
            print(f"Scraping page {page}...")
            response = self.get_page(url)
            if not response:
                break
            quotes = self.parse_quotes(response.content)
            all_quotes.extend(quotes)
            print(f"Found {len(quotes)} quotes on page {page}")
            # Rate limiting: random delay between 1-3 seconds
            delay = random.uniform(1, 3)
            print(f"Waiting {delay:.1f} seconds...")
            time.sleep(delay)
        return all_quotes
# Usage
scraper = WebScraper()
quotes = scraper.scrape_with_rate_limit('https://quotes.toscrape.com', max_pages=3)
print(f"\nTotal quotes scraped: {len(quotes)}")
if quotes:
print("Sample quote:")
print(f" Text: {quotes[0]['text'][:100]}...")
print(f" Author: {quotes[0]['author']}")
print(f" Tags: {', '.join(quotes[0]['tags'])}")
Legal and Ethical Considerations
Important guidelines for web scraping:
# Always check robots.txt
import requests
from urllib.parse import urlparse
def check_robots_txt(url):
"""Check if scraping is allowed by robots.txt"""
parsed_url = urlparse(url)
robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
try:
response = requests.get(robots_url)
if response.status_code == 200:
print("Robots.txt content:")
print(response.text)
# Simple check for User-agent: *
if "User-agent: *" in response.text:
lines = response.text.split('\n')
for line in lines:
if line.startswith('Disallow:'):
path = line.split(':', 1)[1].strip()
if path == '/' or path == '':
print("WARNING: Site disallows all scraping!")
return False
else:
print("No robots.txt found")
except Exception as e:
print(f"Error checking robots.txt: {e}")
return True
# Check terms of service (manual process)
def respect_terms_of_service():
"""
Always check the website's Terms of Service before scraping.
Look for sections about:
- Data usage policies
- API usage requirements
- Rate limiting
- Prohibited activities
"""
print("Remember to:")
print("1. Check Terms of Service")
print("2. Respect robots.txt")
print("3. Don't overload servers")
print("4. Identify your scraper (User-Agent)")
print("5. Cache results when possible")
print("6. Be transparent about your intentions")
# Example usage
url = "https://quotes.toscrape.com"
if check_robots_txt(url):
    print("Proceeding with scraping...")
else:
    print("Scraping not allowed!")
respect_terms_of_service()
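The standard library also ships urllib.robotparser, which applies the robots.txt matching rules more faithfully than the simple string check above; a minimal sketch:
from urllib import robotparser
# Let the standard library evaluate robots.txt for a specific URL and user agent
rp = robotparser.RobotFileParser()
rp.set_url('https://quotes.toscrape.com/robots.txt')
rp.read()
if rp.can_fetch('*', 'https://quotes.toscrape.com/page/1/'):
    print("robots.txt allows fetching this URL")
else:
    print("robots.txt disallows fetching this URL")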
Best Practices
- Check robots.txt and Terms of Service: Respect website policies
- Use appropriate delays: Don't overwhelm servers
- Handle errors gracefully: Network issues are common
- Identify your scraper: Use descriptive User-Agent headers
- Cache results: Avoid re-scraping the same content (see the on-disk cache sketch after this list)
- Use APIs when available: More reliable than scraping
- Monitor your impact: Check server response times
- Store data responsibly: Respect copyright and privacy
- Start small: Test with single pages before scaling
- Keep code maintainable: Handle different page structures
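As referenced in the caching point above, keeping a local copy of each fetched page avoids requesting the same URL twice. A minimal on-disk cache sketch (the cache directory and hashing scheme are illustrative choices, not a required layout):
import hashlib
import pathlib
import requests
CACHE_DIR = pathlib.Path('scrape_cache')  # illustrative cache location
CACHE_DIR.mkdir(exist_ok=True)
def get_cached(url):
    """Return page HTML, fetching from the network only on a cache miss."""
    cache_file = CACHE_DIR / (hashlib.sha256(url.encode()).hexdigest() + '.html')
    if cache_file.exists():
        return cache_file.read_text(encoding='utf-8')
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    cache_file.write_text(response.text, encoding='utf-8')
    return response.text
html = get_cached('https://quotes.toscrape.com/')
print(f"Fetched {len(html)} characters (served from cache on the next call)")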
Common Challenges and Solutions
- Dynamic content: Use Selenium or check for APIs
- CAPTCHAs: Respect anti-bot measures
- Rate limiting: Implement delays and backoff strategies (see the backoff sketch after this list)
- IP blocking: Use proxies or respect limits
- Changing layouts: Write robust selectors
- Encoding issues: Handle different character encodings
- JavaScript redirects: Check final URLs
- Cookies and sessions: Maintain session state when needed
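For the rate-limiting point above, a common pattern beyond fixed delays is exponential backoff: wait progressively longer after each throttled or failed attempt. A minimal sketch (the attempt count and base delay are illustrative):
import time
import requests
def fetch_with_backoff(url, max_attempts=4, base_delay=1.0):
    """Retry a request, doubling the wait after each 429/5xx response."""
    response = None
    for attempt in range(max_attempts):
        response = requests.get(url, timeout=10)
        if response.status_code not in (429, 500, 502, 503, 504):
            return response
        delay = base_delay * (2 ** attempt)  # 1s, 2s, 4s, 8s ...
        print(f"Got {response.status_code}, retrying in {delay:.0f}s...")
        time.sleep(delay)
    return response  # last response, even if it is still an error
response = fetch_with_backoff('https://httpbin.org/status/200')
print(f"Final status: {response.status_code}")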
Web scraping is a powerful tool for data collection, but it must be done responsibly. Always respect website terms of service, implement proper rate limiting, and consider using official APIs when available. BeautifulSoup and requests provide excellent tools for building robust scrapers.