Virtual assistance

Python Regular Expressions

Regular expressions (regex) are powerful tools for pattern matching and text manipulation. Learn how to use Python's re module for searching, matching, and replacing text patterns.

Python Regular Expressions

What are Regular Expressions?

Regular expressions are sequences of characters that define search patterns. They are used for string matching, searching, and manipulation. Python's re module provides support for regular expressions.

"Regular expressions are a powerful way to find and manipulate text patterns."

Basic Pattern Matching

Import the re module and use basic matching:

import re

# re.search scans the whole string for the first place the pattern matches
text = "Hello, World!"
pattern = r"Hello"

found = re.search(pattern, text)
print("Pattern found!" if found else "Pattern not found!")

# re.match only succeeds when the pattern matches at position 0
starts_with_hello = re.match(r"Hello", text) is not None
if starts_with_hello:
    print("Text starts with Hello")

# re.findall returns every non-overlapping occurrence as a list of strings
text = "The cat sat on the mat."
matches = re.findall(r"at", text)
print(matches)  # ['at', 'at', 'at']

Common Regex Patterns

import re

# Sample text containing one e-mail address and one phone number
text = "My email is john@example.com and phone is 123-456-7890"

# Email pattern
# NOTE: inside a character class "|" is a literal pipe, not alternation,
# so the top-level-domain class is written [A-Za-z] rather than [A-Z|a-z].
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
emails = re.findall(email_pattern, text)
print("Emails:", emails)  # Emails: ['john@example.com']

# Phone pattern: three digits, three digits, four digits, dash-separated
phone_pattern = r'\d{3}-\d{3}-\d{4}'
phones = re.findall(phone_pattern, text)
print("Phones:", phones)  # Phones: ['123-456-7890']

# Word boundaries: \b matches the empty position between \w and \W,
# so "john@example.com" is split into 'john', 'example', 'com'
words = re.findall(r'\b\w+\b', text)
print("Words:", words)

# Digits only: each maximal run of digits
numbers = re.findall(r'\d+', text)
print("Numbers:", numbers)  # Numbers: ['123', '456', '7890']

Special Characters and Quantifiers

import re

# One sample string is reused for every quantifier below; the trailing
# comments show the actual output of each call on this text.
text = "aa ab a123 a! @#$ hello123 world456 test_789"

# . (dot) - any character except newline
print(re.findall(r'a.', text))     # ['aa', 'ab', 'a1', 'a!']

# * - zero or more
# a* can match the empty string, so findall also reports an empty match at
# every position where no 'a' starts (and one at the very end of the text).
print(re.findall(r'a*', text))     # ['aa', '', 'a', '', '', 'a', '', '', '', '', 'a', '', ...]

# + - one or more
print(re.findall(r'a+', text))     # ['aa', 'a', 'a', 'a']

# ? - zero or one
print(re.findall(r'ab?', text))    # ['a', 'a', 'ab', 'a', 'a']

# {n} - exactly n times
print(re.findall(r'\d{3}', text))  # ['123', '123', '456', '789']

# {n,m} - between n and m times
print(re.findall(r'\d{2,3}', text)) # ['123', '123', '456', '789']

# ^ - start of string
print(re.match(r'^aa', text))      # <re.Match object; span=(0, 2), match='aa'>

# $ - end of string
print(re.search(r'789$', text))    # <re.Match object; span=(41, 44), match='789'>

Character Classes

import re

# Sample text mixing upper/lower case letters, digits, spaces and an underscore
text = "Hello123 World456 Test_789 abcDEF"

# [abc] - any of a, b, or c
print(re.findall(r'[abc]', text))     # ['a', 'b', 'c']

# [a-z] - any lowercase letter
print(re.findall(r'[a-z]+', text))    # ['ello', 'orld', 'est', 'abc']

# [A-Z] - any uppercase letter
print(re.findall(r'[A-Z]+', text))    # ['H', 'W', 'T', 'DEF']

# [0-9] - any digit
print(re.findall(r'[0-9]+', text))    # ['123', '456', '789']

# [^...] - negated class: any character NOT listed (e.g. [^abc] = anything but a, b, c)
# Here: runs of characters that are neither letters nor digits, in text order
print(re.findall(r'[^a-zA-Z0-9]+', text))  # [' ', ' ', '_', ' ']

# \d - digit, \D - non-digit
print(re.findall(r'\d+', text))       # ['123', '456', '789']
print(re.findall(r'\D+', text))       # ['Hello', ' World', ' Test_', ' abcDEF']

# \w - word character (letters, digits, underscore), \W - non-word
print(re.findall(r'\w+', text))       # ['Hello123', 'World456', 'Test_789', 'abcDEF']
print(re.findall(r'\W+', text))       # [' ', ' ', ' ']

Groups and Capturing

import re

text = "John Doe: 123-456-7890, Jane Smith: 987-654-3210"

# With capturing groups, findall yields one tuple per match,
# holding the text captured by each parenthesised group.
pattern = r'(\w+) (\w+): (\d{3}-\d{3}-\d{4})'
matches = re.findall(pattern, text)
print("Matches:", matches)
# [('John', 'Doe', '123-456-7890'), ('Jane', 'Smith', '987-654-3210')]

# (?P<name>...) groups can be read back by name; search returns the first match only
pattern_named = r'(?P<first>\w+) (?P<last>\w+): (?P<phone>\d{3}-\d{3}-\d{4})'
match = re.search(pattern_named, text)
if match is not None:
    labelled_groups = (
        ("First name:", 'first'),
        ("Last name:", 'last'),
        ("Phone:", 'phone'),
    )
    for label, group_name in labelled_groups:
        print(label, match.group(group_name))

# (?:...) groups without capturing, so findall returns the whole matched text
pattern_nc = r'(?:\w+ )+\w+: \d{3}-\d{3}-\d{4}'
whole_entries = re.findall(pattern_nc, text)
print(whole_entries)

Search and Replace

import re

text = "The price is $100.50 and the discount is $20.25"

# Replace all dollar amounts with euros (\1 re-inserts the captured number)
result = re.sub(r'\$([0-9]+\.[0-9]{2})', r'€\1', text)
print(result)  # The price is €100.50 and the discount is €20.25

# Replace with function
def celsius_to_fahrenheit(match):
    """Return the Fahrenheit text for a Celsius match.

    re.sub calls this once per match; group 1 of *match* must hold the
    numeric Celsius value. The result is formatted with one decimal
    place followed by "°F".
    """
    celsius = float(match.group(1))
    fahrenheit = celsius * 9/5 + 32
    return f"{fahrenheit:.1f}°F"

# Optional minus sign and optional decimal part, so that "-5°C" and "36.6°C"
# are captured as whole numbers instead of losing the sign or the integer part.
celsius_pattern = r'(-?\d+(?:\.\d+)?)°C'

text = "Temperature: 25°C, 30°C, 15°C"
result = re.sub(celsius_pattern, celsius_to_fahrenheit, text)
print(result)  # Temperature: 77.0°F, 86.0°F, 59.0°F

# Replace with limit: count caps the number of substitutions
text = "one two three four five"
result = re.sub(r'\w+', 'word', text, count=3)
print(result)  # word word word four five

Split and Join with Regex

import re

text = "apple, banana; orange: grape, peach"

# A single character class covers all three delimiters;
# \s* swallows the whitespace that follows each one.
fruits = re.split(r'[,;:]\s*', text)
print(fruits)  # ['apple', 'banana', 'orange', 'grape', 'peach']

# Wrapping the delimiter in a capturing group makes split return it too
parts = re.split(r'([,;:])\s*', text)
print(parts)  # ['apple', ',', 'banana', ';', 'orange', ':', 'grape', ',', 'peach']

# Joining needs no regex: str.join with a plain separator
numbers = ['123', '456', '789']
formatted = '-'.join(numbers)
print(formatted)  # 123-456-789

# Reformat 10-digit phone numbers using back-references to the three groups
phones = ['1234567890', '9876543210']
ten_digits = re.compile(r'(\d{3})(\d{3})(\d{4})')
formatted_phones = [ten_digits.sub(r'(\1) \2-\3', raw) for raw in phones]
print(formatted_phones)  # ['(123) 456-7890', '(987) 654-3210']

Advanced Patterns

import re

# Lookahead and lookbehind: zero-width assertions that test the surrounding
# text without including it in the match
text = "password123, admin456, user789"

# Positive lookahead: word characters that are immediately followed by "456"
admins = re.findall(r'\w+(?=456)', text)
print("Admins:", admins)  # ['admin']

# Negative lookahead: whole words that do NOT end in "456".
# The assertion is placed at the start of the word and scans ahead over the
# word; a trailing \w+(?!456) would not work, because the greedy \w+ swallows
# the "456" first and the lookahead then only sees the following comma.
non_admins = re.findall(r'\b(?!\w*456\b)\w+', text)
print("Non-admins:", non_admins)  # ['password123', 'user789']

# Positive lookbehind: digits that are immediately preceded by "password"
after_pass = re.findall(r'(?<=password)\d+', text)
print("After password:", after_pass)  # ['123']

# Non-greedy matching: .*? stops at the first possible closing tag
text = "<div>First</div><div>Second</div>"
matches = re.findall(r'<div>.*?</div>', text)
print("Non-greedy:", matches)  # ['<div>First</div>', '<div>Second</div>']

# Greedy matching: .* runs to the last possible closing tag
greedy = re.findall(r'<div>.*</div>', text)
print("Greedy:", greedy)  # ['<div>First</div><div>Second</div>']

Compiling Regular Expressions

For better performance with repeated use:

import re

# Compile pattern for reuse
# NOTE: the top-level-domain class is [A-Za-z]; a "|" inside [...] is a
# literal pipe character, not alternation.
email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

text = """
John: john@example.com
Jane: jane@test.org
Bob: bob@company.net
"""

emails = email_pattern.findall(text)
print("Emails found:", emails)  # ['john@example.com', 'jane@test.org', 'bob@company.net']

# Using compiled pattern methods
# fullmatch() requires the entire string to match; match() would only anchor
# the start and accept trailing text after a valid-looking prefix.
for email in emails:
    if email_pattern.fullmatch(email):
        print(f"{email} is valid")
    else:
        print(f"{email} is invalid")

# Flags with compiled patterns
case_insensitive = re.compile(r'hello', re.IGNORECASE)
print(case_insensitive.findall("Hello HELLO hello"))  # ['Hello', 'HELLO', 'hello']

Common Use Cases

import re

# Validate email
def is_valid_email(email):
    """Return True if *email* looks like ``local@domain.tld``.

    This is a shape check only (allowed characters, one "@", a dot followed
    by at least two letters); it does not prove the address exists.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, so "user@example.com\n" used to be accepted.
    return re.fullmatch(pattern, email) is not None

# Extract URLs
def extract_urls(text):
    """Return every http/https URL found in *text*, in order of appearance.

    The pattern uses only non-capturing groups, so re.findall returns the
    full matched URLs as strings. A query string or fragment is recognised
    only after a path (i.e. after a "/" following the host).
    """
    url_pattern = (
        r'https?://'             # scheme
        r'[-\w.]+'               # host
        r'(?::\d+)?'             # optional port: one colon followed by digits
        r'(?:/[-\w/.~%+]*'       # optional path ("-" included so my-page is kept whole)
        r'(?:\?[-\w&=%.+]*)?'    # optional query string
        r'(?:#[-\w]*)?'          # optional fragment
        r')?'
    )
    return re.findall(url_pattern, text)

# Parse log files
# One sample log line: "<date> <time> <LEVEL> <free-text message>"
log_line = "2023-12-01 10:30:45 ERROR Database connection failed"
timestamp_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'
level_pattern = r'\b(ERROR|WARNING|INFO|DEBUG)\b'
message_pattern = r'(ERROR|WARNING|INFO|DEBUG)\s+(.+)$'

# Run the three searches first, then pull out the pieces of interest
timestamp_match = re.compile(timestamp_pattern).search(log_line)
level_match = re.compile(level_pattern).search(log_line)
message_match = re.compile(message_pattern).search(log_line)

timestamp = timestamp_match.group(1)   # the captured date and time
level = level_match.group(0)           # the whole match is the level keyword
message = message_match.group(2)       # everything after the level keyword

print(f"Timestamp: {timestamp}")
print(f"Level: {level}")
print(f"Message: {message}")

# Format validation
def is_valid_phone(phone):
    """Return True if *phone* is a 10-digit number in a supported layout.

    Accepts formats: (123) 456-7890, 123-456-7890, 123.456.7890, 1234567890
    (each separator is optional and may be "." or "-").
    """
    pattern = r'(\(\d{3}\)\s*|\d{3}[.-]?)\d{3}[.-]?\d{4}'
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, so "123-456-7890\n" used to be accepted.
    return re.fullmatch(pattern, phone) is not None

# Every entry below is a supported layout, so each line prints True
phones_to_test = ["(123) 456-7890", "123-456-7890", "123.456.7890", "1234567890"]
for phone in phones_to_test:
    print(f"{phone}: {is_valid_phone(phone)}")

Best Practices

  • Use raw strings for patterns: r"pattern" to avoid escape issues
  • Compile patterns for reuse: Better performance for repeated use
  • Use specific patterns: Avoid .* when possible
  • Test patterns thoroughly: Use online regex testers
  • Use verbose mode for complex patterns: re.VERBOSE flag
  • Handle exceptions: re.error for invalid patterns
  • Use non-greedy matching: .*? instead of .* when appropriate
  • Document complex patterns: Comments and examples
  • Consider alternatives: str methods for simple operations
  • Profile performance: Regex can be slow for large texts

Performance Tips

  • Avoid catastrophic backtracking: Use atomic groups or possessive quantifiers (available in Python's re module from version 3.11), and avoid nested quantifiers such as (a+)+
  • Use anchors: ^ and $ to limit search scope
  • Pre-compile patterns: Especially in loops
  • Use appropriate flags: re.MULTILINE, re.DOTALL, etc.
  • Consider string methods: str.find(), str.split() for simple cases

Regular expressions are incredibly powerful for text processing, but they can be complex. Start with simple patterns and gradually build complexity. Always test your patterns thoroughly with various inputs.