Python Regular Expressions
Regular expressions (regex) are powerful tools for pattern matching and text manipulation. Learn how to use Python's re module for searching, matching, and replacing text patterns.
What are Regular Expressions?
Regular expressions are sequences of characters that define search patterns. They are used for string matching, searching, and manipulation. Python's re module provides support for regular expressions.
"Regular expressions are a powerful way to find and manipulate text patterns."
Basic Pattern Matching
Import the re module and use its basic matching functions:
import re
# Simple matching: re.search() scans the whole string for the pattern
text = "Hello, World!"
pattern = r"Hello"
if re.search(pattern, text):
    print("Pattern found!")
else:
    print("Pattern not found!")

# Match at the beginning: re.match() only succeeds at position 0
if re.match(r"Hello", text):
    print("Text starts with Hello")

# Find all occurrences: re.findall() returns every non-overlapping match
text = "The cat sat on the mat."
matches = re.findall(r"at", text)
print(matches)  # ['at', 'at', 'at']
Common Regex Patterns
import re
text = "My email is john@example.com and phone is 123-456-7890"

# Email pattern. The top-level-domain class is [A-Za-z]: inside [...] a "|"
# is a literal pipe character, not alternation, so it must not be listed.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
emails = re.findall(email_pattern, text)
print("Emails:", emails)  # Emails: ['john@example.com']

# Phone pattern
phone_pattern = r'\d{3}-\d{3}-\d{4}'
phones = re.findall(phone_pattern, text)
print("Phones:", phones)  # Phones: ['123-456-7890']

# Word boundaries
words = re.findall(r'\b\w+\b', text)
print("Words:", words)
# Words: ['My', 'email', 'is', 'john', 'example', 'com', 'and', 'phone', 'is', '123', '456', '7890']

# Digits only
numbers = re.findall(r'\d+', text)
print("Numbers:", numbers)  # Numbers: ['123', '456', '7890']
Special Characters and Quantifiers
import re
text = "aa ab a123 a! @#$ hello123 world456 test_789"

# . (dot) - any character except newline
print(re.findall(r'a.', text))  # ['aa', 'ab', 'a1', 'a!']

# * - zero or more (an empty match is reported wherever no 'a' is present)
print(re.findall(r'a*', text))  # ['aa', '', 'a', '', '', 'a', '', '', '', '', 'a', '', ...]

# + - one or more
print(re.findall(r'a+', text))  # ['aa', 'a', 'a', 'a']

# ? - zero or one
print(re.findall(r'ab?', text))  # ['a', 'a', 'ab', 'a', 'a']

# {n} - exactly n times
print(re.findall(r'\d{3}', text))  # ['123', '123', '456', '789']

# {n,m} - between n and m times (greedy, so it takes three digits when it can)
print(re.findall(r'\d{2,3}', text))  # ['123', '123', '456', '789']

# ^ - start of string
print(re.match(r'^aa', text))  # <re.Match object; span=(0, 2), match='aa'>

# $ - end of string
print(re.search(r'789$', text))  # <re.Match object; span=(41, 44), match='789'>
Character Classes
import re
text = "Hello123 World456 Test_789 abcDEF"

# [abc] - any of a, b, or c
print(re.findall(r'[abc]', text))  # ['a', 'b', 'c']

# [a-z] - any lowercase letter
print(re.findall(r'[a-z]+', text))  # ['ello', 'orld', 'est', 'abc']

# [A-Z] - any uppercase letter
print(re.findall(r'[A-Z]+', text))  # ['H', 'W', 'T', 'DEF']

# [0-9] - any digit
print(re.findall(r'[0-9]+', text))  # ['123', '456', '789']

# [^...] - negated class: any character NOT listed (here: not a letter or digit)
print(re.findall(r'[^a-zA-Z0-9]+', text))  # [' ', ' ', '_', ' ']

# \d - digit, \D - non-digit
print(re.findall(r'\d+', text))  # ['123', '456', '789']
print(re.findall(r'\D+', text))  # ['Hello', ' World', ' Test_', ' abcDEF']

# \w - word character (letters, digits, underscore), \W - non-word
print(re.findall(r'\w+', text))  # ['Hello123', 'World456', 'Test_789', 'abcDEF']
print(re.findall(r'\W+', text))  # [' ', ' ', ' ']
Groups and Capturing
import re
text = "John Doe: 123-456-7890, Jane Smith: 987-654-3210"

# Capturing groups: findall() returns one tuple of groups per match
pattern = r'(\w+) (\w+): (\d{3}-\d{3}-\d{4})'
matches = re.findall(pattern, text)
print("Matches:", matches)
# [('John', 'Doe', '123-456-7890'), ('Jane', 'Smith', '987-654-3210')]

# Named groups: search() stops at the first match, so this reports John Doe
pattern_named = r'(?P<first>\w+) (?P<last>\w+): (?P<phone>\d{3}-\d{3}-\d{4})'
match = re.search(pattern_named, text)
if match:
    print("First name:", match.group('first'))
    print("Last name:", match.group('last'))
    print("Phone:", match.group('phone'))

# Non-capturing groups: with no capturing group, findall() returns whole matches
pattern_nc = r'(?:\w+ )+\w+: \d{3}-\d{3}-\d{4}'
print(re.findall(pattern_nc, text))
# ['John Doe: 123-456-7890', 'Jane Smith: 987-654-3210']
Search and Replace
import re
text = "The price is $100.50 and the discount is $20.25"

# Replace all dollar amounts with euros
result = re.sub(r'\$([0-9]+\.[0-9]{2})', r'€\1', text)
print(result)  # The price is €100.50 and the discount is €20.25


# Replace with function
def celsius_to_fahrenheit(match):
    """Return the Fahrenheit rendering of a matched Celsius reading.

    Args:
        match: A match object whose group 1 holds the Celsius number as text.

    Returns:
        The converted temperature formatted with one decimal place and a
        trailing "°F".
    """
    celsius = float(match.group(1))
    fahrenheit = celsius * 9 / 5 + 32
    return f"{fahrenheit:.1f}°F"


text = "Temperature: 25°C, 30°C, 15°C"
# The optional sign and fraction keep readings such as -5°C or 36.6°C intact;
# a bare \d+ would convert only the digits directly in front of "°C".
temperature_pattern = r'(-?\d+(?:\.\d+)?)°C'
result = re.sub(temperature_pattern, celsius_to_fahrenheit, text)
print(result)  # Temperature: 77.0°F, 86.0°F, 59.0°F

# Replace with limit: count caps the number of substitutions
text = "one two three four five"
result = re.sub(r'\w+', 'word', text, count=3)
print(result)  # word word word four five
Split and Join with Regex
import re
text = "apple, banana; orange: grape, peach"

# Split on multiple delimiters: a comma, semicolon or colon, plus any
# whitespace that follows it, separates two items.
fruits = re.split(pattern=r'[,;:]\s*', string=text)
print(fruits)  # ['apple', 'banana', 'orange', 'grape', 'peach']

# Split with capturing groups: the captured delimiter is kept in the result,
# while the whitespace outside the group is still discarded.
parts = re.split(pattern=r'([,;:])\s*', string=text)
print(parts)  # ['apple', ',', 'banana', ';', 'orange', ':', 'grape', ',', 'peach']

# Join with pattern: plain str.join is enough, no regex needed
numbers = ['123', '456', '789']
formatted = str.join('-', numbers)
print(formatted)  # 123-456-789

# Format phone numbers: regroup ten digits as "(AAA) BBB-CCCC"
phones = ['1234567890', '9876543210']
formatted_phones = list(
    map(
        lambda digits: re.sub(r'(\d{3})(\d{3})(\d{4})', r'(\1) \2-\3', digits),
        phones,
    )
)
print(formatted_phones)  # ['(123) 456-7890', '(987) 654-3210']
Advanced Patterns
import re
# Lookahead and lookbehind
text = "password123, admin456, user789"

# Positive lookahead: word characters immediately followed by "456"
admins = re.findall(r'\w+(?=456)', text)
print("Admins:", admins)  # ['admin']

# Negative lookahead: whole words that do not end in "456".
# The check has to sit at the START of the word (\b(?!...)); writing
# \w+(?!456) excludes nothing, because \w+ first swallows the entire word,
# digits included, and only then looks at what follows it.
non_admins = re.findall(r'\b(?!\w*456\b)\w+', text)
print("Non-admins:", non_admins)  # ['password123', 'user789']

# Positive lookbehind: digits immediately preceded by "password"
after_pass = re.findall(r'(?<=password)\d+', text)
print("After password:", after_pass)  # ['123']

# Non-greedy matching: .*? stops at the first possible closing tag
text = "<div>First</div><div>Second</div>"
matches = re.findall(r'<div>.*?</div>', text)
print("Non-greedy:", matches)  # ['<div>First</div>', '<div>Second</div>']

# Greedy matching: .* runs on to the last possible closing tag
greedy = re.findall(r'<div>.*</div>', text)
print("Greedy:", greedy)  # ['<div>First</div><div>Second</div>']
Compiling Regular Expressions
For better performance with repeated use:
import re
# Compile pattern for reuse. The top-level-domain class is [A-Za-z]: inside
# [...] a "|" is a literal pipe character, not alternation.
email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
text = """
John: john@example.com
Jane: jane@test.org
Bob: bob@company.net
"""
emails = email_pattern.findall(text)
print("Emails found:", emails)
# Emails found: ['john@example.com', 'jane@test.org', 'bob@company.net']

# Using compiled pattern methods. fullmatch() requires the whole string to be
# an address; match() would also accept a valid address followed by junk.
for email in emails:
    if email_pattern.fullmatch(email):
        print(f"{email} is valid")
    else:
        print(f"{email} is invalid")

# Flags with compiled patterns
case_insensitive = re.compile(r'hello', re.IGNORECASE)
print(case_insensitive.findall("Hello HELLO hello"))  # ['Hello', 'HELLO', 'hello']
Common Use Cases
import re
# Validate email
def is_valid_email(email):
    """Return True when *email*, in its entirety, looks like a simple address.

    The check is purely syntactic: a local part, "@", a domain, a dot and a
    top-level domain of at least two letters.

    Args:
        email: The string to validate.

    Returns:
        True if the whole string matches the pattern, False otherwise.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # fullmatch() instead of match() with ^...$: "$" also matches just before
    # a trailing newline, which would let "user@example.com\n" through.
    return re.fullmatch(pattern, email) is not None
# Extract URLs
def extract_urls(text):
    """Return every http:// or https:// URL found in *text*.

    The pattern is deliberately simple: a host, an optional ":port", and an
    optional path that may carry a query string and a fragment.

    Args:
        text: The string to scan.

    Returns:
        A list of the matched URL strings, in order of appearance (empty if
        there are none).
    """
    url_pattern = (
        r'https?://[-\w.]+'      # scheme and host
        r'(?::\d+)?'             # optional port: one colon followed by digits
        r'(?:/[\w/.]*'           # optional path ...
        r'(?:\?[\w&=%.]*)?'      # ... with an optional query string
        r'(?:#\w*)?)?'           # ... and an optional fragment
    )
    return re.findall(url_pattern, text)
# Parse log files: one pattern per field of a "<date> <time> <LEVEL> <message>" line
timestamp_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'
level_pattern = r'\b(ERROR|WARNING|INFO|DEBUG)\b'
message_pattern = r'(ERROR|WARNING|INFO|DEBUG)\s+(.+)$'

log_line = "2023-12-01 10:30:45 ERROR Database connection failed"

# Subscripting a match object is shorthand for .group(n); index 0 is the
# whole match, higher indexes are the capturing groups.
timestamp = re.search(timestamp_pattern, log_line)[1]
level = re.search(level_pattern, log_line)[0]
message = re.search(message_pattern, log_line)[2]

print(
    f"Timestamp: {timestamp}",
    f"Level: {level}",
    f"Message: {message}",
    sep="\n",
)
# Format validation
def is_valid_phone(phone):
    """Return True when *phone*, in its entirety, is a ten-digit phone number.

    Args:
        phone: The string to validate.

    Returns:
        True if the whole string matches one of the accepted layouts, False
        otherwise.
    """
    # Accepts formats: (123) 456-7890, 123-456-7890, 123.456.7890, 1234567890.
    # Each separator is optional and may be "." or "-", so mixed forms such as
    # 123.456-7890 are accepted as well.
    pattern = r'(?:\(\d{3}\)\s*|\d{3}[.-]?)\d{3}[.-]?\d{4}'
    # fullmatch() instead of match() with ^...$: "$" also matches just before
    # a trailing newline, which would let "123-456-7890\n" through.
    return re.fullmatch(pattern, phone) is not None


phones_to_test = ["(123) 456-7890", "123-456-7890", "123.456.7890", "1234567890"]
for phone in phones_to_test:
    print(f"{phone}: {is_valid_phone(phone)}")
Best Practices
- Use raw strings for patterns: r"pattern" to avoid escape issues
- Compile patterns for reuse: Better performance for repeated use
- Use specific patterns: Avoid .* when possible
- Test patterns thoroughly: Use online regex testers
- Use verbose mode for complex patterns: re.VERBOSE flag
- Handle exceptions: re.error for invalid patterns
- Use non-greedy matching: .*? instead of .* when appropriate
- Document complex patterns: Comments and examples
- Consider alternatives: str methods for simple operations
- Profile performance: Regex can be slow for large texts
Performance Tips
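- Avoid catastrophic backtracking: Do not nest or overlap quantifiers (for example (a+)+); atomic groups (?>...) and possessive quantifiers (*+, ++, ?+) prevent it, but Python's re module supports them only from version 3.11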
- Use anchors: ^ and $ to limit search scope
- Pre-compile patterns: Especially in loops
- Use appropriate flags: re.MULTILINE, re.DOTALL, etc.
- Consider string methods: str.find(), str.split() for simple cases
Regular expressions are incredibly powerful for text processing, but they can be complex. Start with simple patterns and gradually build complexity. Always test your patterns thoroughly with various inputs.