Advanced Python
In-Class Exercise Solutions, Session 4
MATCHING |
|
Ex. 4.1 | Match a simple character pattern. |
Search for 'Velas', then try 'Benter' and 'Acme'. |
|
import re
# Print every line that contains the literal text 'Velas'.
lines = [
    'Acme Corporation is heded by CEO Joseph Benter, and ',
    'President Maria Velas. Mr. Benter focuses on R&D ',
    'while Ms. Velas provides vision and major deals for ',
    'Acme. ',
]  # list of strings
for line in lines:  # str, 'Acme Corporation is ... '
    if re.search(r'Velas', line):  # None (falsy) for the first string
        print(line)
|
|
Ex. 4.2 | 'not' to negate a search. Execute previous pattern with 'not' in front of re.search() |
import re
# Print every line that does NOT contain 'Benter' (negated search).
lines = [
    'Acme Corporation is heded by CEO Joseph Benter, and ',
    'President Maria Velas. Mr. Benter focuses on R&D ',
    'while Ms. Velas provides vision and major deals for ',
    'Acme. ',
]  # list of strings
for line in lines:  # str, 'Acme Corporation ...'
    # 'not' turns the Match/None result into a real bool.
    if not re.search(r'Benter', line):  # bool, False (first line has 'Benter')
        print(line)
|
|
ANCHORS |
|
Ex. 4.3 | Anchors - start of string. |
Print only those lines that have 'TEL' at the start: |
|
import re
# Print only the lines that begin with 'TEL' (start-of-string anchor).
for text_line in ['AURORA HOTEL',
                  'OPEN12:00 AM - 11:59 PM',
                  '14200 E ALAMEDA AVE AURORA, CO 80012',
                  'TEL (303) 344-9901']:  # list of strings
    # '^' keeps the 'TEL' inside 'HOTEL' from matching.
    if re.search(r'^TEL', text_line):  # None (falsy) for the first string
        print(text_line)
|
|
Ex. 4.4 | Anchors - end of string. |
Print only those files that end in .jpg |
|
import re
# Print only the filenames that end in '.jpg' (end-of-string anchor).
filenames = ['image.jpg', 'image.png', 'filejpg.txt', 'file2.doc',
             'file3.pdf', 'image2.gif', 'image3.jpg', 'image4.jpg']
for name in filenames:  # str, 'image.jpg'
    # The escaped period is literal, so 'filejpg.txt' is not matched.
    if re.search(r'\.jpg$', name):  # Match (truthy) for the first string
        print(name)
|
|
BUILT-IN CHARACTER CLASSES |
|
Ex. 4.5 | "Digit" character class. |
Match on each string that has a digit. |
|
import re
# Print each string that contains a digit, then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'\d', string):  # Match (truthy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.6 | "Word" character class. |
Match each string that has a letter, number or underscore. |
|
import re
# Print each string that contains a "word" character (letter, digit or
# underscore), then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'\w', string):  # Match (truthy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.7 | "Space" character class. |
Match on each line that has a space. |
|
import re
# Print each string that contains a whitespace character, then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'\s', string):  # Match (truthy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
INVERSE CHARACTER CLASSES |
|
Ex. 4.8 | "Not a digit" character class. |
Match on each string that has a character that is not a digit. |
|
import re
# Print each string that contains a non-digit character, then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'\D', string):  # Match (truthy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.9 | "Not a space" character class. |
Match on each string that has any non-spaces. |
|
import re
# Print each string that contains a non-whitespace character, then how
# many matched. The whitespace-only entry is the one that is skipped.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    ' ', 'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'\S', string):  # Match (truthy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
CUSTOM CHARACTER CLASSES |
|
Ex. 4.10 | Custom character class. |
Match on each string that has a capital letter in it. |
|
import re
# Print each string that contains a capital letter, then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'[A-Z]', string):  # None (falsy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.11 | Using custom character class with built-in character class. |
Match on each string that has a letter followed by a number. |
|
import re
# Print each string in which a letter is immediately followed by a digit,
# then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'[A-Za-z]\d', string):  # None (falsy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
INVERSE CUSTOM CHARACTER CLASSES |
|
Ex. 4.12 | Inverse Custom Character Class. Match on each string that has any character that is not a letter. |
import re
# Print each string that contains any character that is not a letter,
# then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    # The first string matches: its spaces and digits are non-letters.
    if re.search(r'[^a-zA-Z]', string):  # Match (truthy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.13 | Match on each string that ends with a character that is not a digit. |
import re
# Print each string whose last character is not a digit, then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'\D$', string):  # None (falsy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
THE WILDCARD |
|
Ex. 4.14 | Demo: match on any character. |
Use the wildcard (., a period) to see which strings match it. |
|
import re
# Demo of the wildcard: '.' matches any character except a newline, so
# every non-empty string here matches.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'.', string):  # Match (truthy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
LAB 1 |
|
Ex. 4.15 | Match on each string that starts with a digit. |
import re
# Print each string that starts with a digit, then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    # ' 23 bonjour' is skipped: its first character is a space.
    if re.search(r'^\d', string):  # None (falsy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.16 | Match on each string that starts with a space. |
import re
# Print each line whose first character is whitespace.
lines = ['this is the first line,',
         ' and this is the second line and',
         ' this is the third line. ']  # list of strings
for line in lines:  # str, 'this is the first line,'
    if re.search(r'^\s', line):  # None (falsy) for the first string
        print(line)
|
|
Ex. 4.17 | Loop through and print only lines with some text (not including spaces). |
import re
# Print only the lines that hold at least one non-whitespace character.
# NOTE(review): the exercise suggests the text had blank lines between
# entries; none are present in this copy - confirm against the notebook.
text = """line 1
line 2,
line 3...
line4!"""  # str
lines = text.splitlines()  # list, ['line 1', 'line 2,', 'line 3...', 'line4!']
for line in lines:  # str, 'line 1'
    if re.search(r'\S', line):  # Match (truthy) for the first string
        print(line)
|
|
Ex. 4.18 | Match on each string that ends with a digit. |
import re
# Print each string that ends with a digit, then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'\d$', string):  # Match (truthy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.19 | Match on each line that ends with a space. |
import re
# Print each line whose last character is whitespace.
lines = ['this is the first line, ',
         'this is the second line and',
         'this is the third line. ']  # list of strings
for line in lines:  # str, 'this is the first line, '
    if re.search(r'\s$', line):  # Match (truthy) for the first string
        print(line)
|
|
Ex. 4.20 | Match on each string that consists only of a 2-digit number. |
import re
# Print each string that is exactly a 2-digit number, then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    # Both anchors are needed so nothing may precede or follow the digits.
    if re.search(r'^\d\d$', string):  # None (falsy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.21 | Match on a capital letter followed by a lowercase letter. |
import re
# Print each string with a capital letter directly followed by a
# lowercase letter, then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'[A-Z][a-z]', string):  # None (falsy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.22 | Match on files with date format YYYY-MM-DD followed by '.txt'. |
import re
# Print directory entries named exactly YYYY-MM-DD.txt.
dirlist = ('.', '..', '2010-12-15.txt', '2010-12-16.txt',
           'testfile.txt', '20101-11-03.txt')  # tuple, ('.', '..', ...)
for item in dirlist:  # str, '.'
    # '^' rejects the 5-digit year in '20101-11-03.txt'.
    if re.search(r'^\d\d\d\d\-\d\d\-\d\d\.txt$', item):  # None (falsy) for '.'
        print(item)
|
|
Ex. 4.23 | Match on date format MM/DD/YY (and not 4-digit year). |
import re
# Print only the dates written as MM/DD/YY (two-digit year).
dates = ['Jan. 3, 2018', '23-Mar-17', '12/02/98',
         '12/03/1998', '23.17.2018']  # list of strs
for date in dates:  # str, 'Jan. 3, 2018'
    # '$' right after the two year digits rejects '12/03/1998'.
    if re.search(r'^\d\d\/\d\d\/\d\d$', date):  # None (falsy) for the first string
        print(date)
|
|
Ex. 4.24 | Determine whether selected word begins with a vowel. If so, prepend an 'an' rather than an 'a'. |
import re
# Print each word with the right article: 'an' if the word starts with a
# lowercase vowel, otherwise 'a'.
words = ['apple', 'pear', 'orange', 'kiwi', 'elderberry', 'carrot', 'ugli fruit']
for word in words:  # str, 'apple'
    if re.search(r'^[aeiou]', word):  # Match (truthy) for the first string
        prepend = 'an'  # str, 'an'
    else:
        prepend = 'a'  # str, 'a'
    print(f"{prepend} {word}")
|
|
BUILT-IN QUANTIFIERS |
|
Ex. 4.25 | "One or more" quantifier. Match on each string that has one or more letters in it. |
import re
# Print each string that contains one or more letters, then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'[a-zA-Z]+', string):  # Match (truthy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.26 | "Zero or one" quantifier. |
Without using a character class (or grouped alternates), use a single regex that matches on each string that has 'a' or 'an' followed by a space. |
|
import re
# Print each line containing 'a' or 'an' followed by whitespace; the '?'
# makes the 'n' optional.
lines = [
    'This is a wonderful thing. ',
    "I haven't seen anything like it. ",
    "Isn't it an exceptional experience? ",
]  # list of strs
for line in lines:  # str, 'This is a wonderful thing. '
    if re.search(r'an?\s', line):  # Match (truthy) for the first string
        print(line)
|
|
Ex. 4.27 | "Zero or more" quantifier, quantifiers with anchor. |
Match on all strings that consist only of a 1 followed by zero or more digits. |
|
import re
# Print each string that is a '1' followed only by zero or more digits.
numbers = [
    '100',
    '135',
    '31',
    '1',
    '1 think',
]  # list of strs
for val in numbers:  # str, '100'
    if re.search(r'^1\d*$', val):  # Match (truthy) for the first string
        print(val)
|
|
Ex. 4.28 | Quantifiers with Anchor. Match on each string that consists only of one or more digit characters. |
import re
# Print each string made up only of digits, then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'^\d+$', string):  # None (falsy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.29 | Quantifiers with Anchor (3). Match on each string that consists only of letters. |
import re
# Print each string made up only of letters, then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    # The first string fails: it contains spaces and digits.
    if re.search(r'^[a-zA-Z]+$', string):  # None (falsy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.30 | Quantifiers with custom character class. |
Match each string that has a capital letter followed by one or more lowercase letters. |
|
import re
# Print each string with a capital letter followed by one or more
# lowercase letters, then how many matched.
match_strings = [
    'hello World 00', 'goodbye C world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello World 00'
    # 'World' matches; the lone 'C' in the second string does not.
    if re.search(r'[A-Z][a-z]+', string):  # Match (truthy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.31 | Quantifiers with anchors. Match on each string that consists only of letters, numbers or the underscore. |
import re
# Print each string made up only of "word" characters (letters, digits,
# underscore), then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'^\w+$', string):  # None (falsy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.32 | Quantifiers with anchors (2). Match on each string that consists only of non-digits. |
import re
# Print each string that contains no digits at all, then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'^\D+$', string):  # None (falsy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.33 | Quantifiers with anchors (3). Match on each string that consists only of non-spaces. |
import re
# Print each string that contains no whitespace at all, then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'^\S+$', string):  # None (falsy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
CUSTOM QUANTIFIERS |
|
Ex. 4.34 | Custom quantifier. |
Match on each string that has two or more spaces at the end. |
|
import re
# Print each string that ends in two or more whitespace characters, then
# how many matched.
# NOTE(review): as written here no string has more than one trailing
# space, so nothing matches; the original data may have had longer runs
# of spaces - confirm against the notebook.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'\s{2,}$', string):  # None (falsy) for the first string
        print(string)
        count += 1
print(f'count: {count}')
|
|
Ex. 4.35 | Custom quantifier. |
Match on strings that have a capital letter followed by two or more lowercase letters. |
|
import re
# Print each string with a capital letter followed by two or more
# lowercase letters, then how many matched.
match_strings = [
    'hello World 00', 'goodbye As world ', 'To 23 bonjour', 'wilkommen23 ',
    'Aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello World 00'
    # 'World' matches; 'As' and 'To' have only one lowercase letter.
    if re.search(r'[A-Z][a-z]{2,}', string):  # Match (truthy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.36 | Custom quantifier. |
Print those numbers that are in the millions (i.e., 7 or more digits). |
|
import re
# Print the numbers that have 7 or more digits (one million and up).
nums = [
    '1',
    '10',
    '100',
    '1000',
    '10000',
    '100000',
    '1000000',
    '10000000',
]  # list of strs
for num in nums:  # str, '1'
    if re.search(r'\d{7,}', num):  # None (falsy) for the first string
        print(num)
|
|
Ex. 4.37 | Custom quantifier. Having split the text into words, show those words that are greater than 7 characters in size. |
import re
# Split the text into words and print those longer than 7 characters.
text = """This is the 1000th story, regarding a duck
named Quack. It was unlikely that Quack could have been
given a name like that by his mother, so we can only conclude
that he was named by the author, who has a cuteness problem."""  # str
words = text.split()  # list, ['This', 'is', ... ]
stripped = [word.rstrip('.,') for word in words]  # list, ['This', 'is', ... ]
# "Greater than 7" means at least 8 word characters; {7,} would also
# print 7-character words such as 'problem'.
LONG_WORD_PATTERN = r'\w{8,}'
for word in stripped:  # str, 'This'
    if re.search(LONG_WORD_PATTERN, word):  # None (falsy) for the first string
        print(word)
|
|
Ex. 4.38 | Custom Quantifier. |
A password must be 3-8 characters in length (letters, numbers and underscores are permitted). Validate the below password attempts. |
|
import re
# Validate passwords: 3-8 characters, letters, digits and underscores only.
attempts = [
    '1234',
    'hello_there',
    'password',
    'ok',
    'what?',
    'supercalifrag',
]  # list of strings
# '\Z' matches only at the very end of the string. '$' would also match
# just before a trailing newline, letting 'abcd\n' through as valid.
PASSWORD_PATTERN = r'^\w{3,8}\Z'
for password in attempts:  # str, '1234'
    if re.search(PASSWORD_PATTERN, password):  # Match (truthy) for the first string
        print(f'{password}: validated')
|
|
ESCAPING SPECIAL CHARACTERS |
|
Ex. 4.39 | Escape wildcard (aka period). Match on each string that has a letter, number or underscore followed by a period. |
import re
# Print each string in which a "word" character is followed by a literal
# period, then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    # The backslash makes the period literal instead of a wildcard.
    if re.search(r'\w\.', string):  # None (falsy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.40 | Escape end anchor (aka dollar sign). |
Match on strings that have a dollar amount, including two decimal places ($23.53). |
|
import re
# Print lines that contain a dollar amount with exactly two decimal
# places, such as $23.53.
lines = [
    'The coat cost $239.50.',
    'The candy cost $1.93',
    "I didn't buy anything today.",
    '$1 sale',
    'I dream of $$$',
]  # list of strings
# '\$' is a literal dollar sign. '\d\d\b' requires exactly two decimals:
# the word boundary rejects a third digit, and '$1.9' has too few.
DOLLAR_AMOUNT_PATTERN = r'\$\d+\.\d\d\b'
for line in lines:  # str, 'The coat cost $239.50.'
    if re.search(DOLLAR_AMOUNT_PATTERN, line):  # Match (truthy) for the first string
        print(line)
|
|
Ex. 4.41 | Escape quantifier character +. |
Match on all lines with positive numbers. |
|
import re
# Print the lines whose amount carries an explicit plus sign.
numbers = [
    'Amount: -23.9',
    'Amount: +43.8',
    'Amount: -9.03',
    'Amount: +99.9',
    'Amount: +22.0',
]  # list of strings
for num in numbers:  # str, 'Amount: -23.9'
    # '\+' is a literal plus sign, not the "one or more" quantifier.
    if re.search(r'\+\d+\.\d+', num):  # None (falsy) for the first string
        print(num)
|
|
Ex. 4.42 | Escape quantifier character *. |
Match on all lines with asterisked footnotes. |
|
import re
# Print the lines that contain an asterisk (footnote marker).
numbers = [
    'As Ibid* said,',
    'there should be no greater good ',
    'than compassion*, love, ',
    'mutual benefit*',
    'and the profit-making motive.',
]  # list of strings (text lines, despite the variable name)
for num in numbers:  # str, 'As Ibid* said,'
    # '\*' is a literal asterisk, not the "zero or more" quantifier.
    if re.search(r'\*', num):  # Match (truthy) for the first string
        print(num)
|
|
LAB 2 |
|
Ex. 4.43 | Match on each string that has one or more "word" characters, followed by one or more spaces, followed by one or more "word" characters. |
import re
# Print each string containing word characters, then whitespace, then
# word characters again; report how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'\w+\s+\w+', string):  # Match (truthy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.44 | Ignore comment lines: print only those lines that don't start with a comment (the first non-space character is a hash mark). |
import re
# Print every line that is not a comment line, i.e. whose first
# non-whitespace character is not '#'. Trailing comments are kept.
text = """
# this is a program to do stuff
a = 5
b = 10 # an int
if True:
# multiply them
c = a * b
"""
for line in text.splitlines():  # str, '' (the text starts with a newline)
    # '\s*' lets the hash mark follow any amount of leading whitespace.
    if not re.search(r'^\s*#', line):
        print(line)
|
|
Ex. 4.45 | Match those lines that contain a 7-digit hex number (a-fA-F0-9). |
import re
# Print the lines that contain a run of 7 hexadecimal digits.
lines = [
    'The color code is #ABF2307.',
    'Mr. Mxyzptlk is 999 years old today.',
    'The memory address is fc9d223.',
]  # list of strings
for line in lines:  # str, 'The color code is #ABF2307.'
    if re.search(r'[a-fA-F0-9]{7}', line):  # Match (truthy) for the first string
        print(line)
|
|
Ex. 4.46 | Show those lines that contain two capitalized words (as in a name). |
import re
# Print the lines that contain two capitalized words in a row, as in a name.
lines = [
    'The owner is Gwen Harstridge.',
    "There aren't a lot of stores like this one.",
    'Paris is not a lot like Rome.',
    'I hail from Los Angeles, California.',
]  # list of strings
for line in lines:  # str, 'The owner is Gwen Harstridge.'
    if re.search(r'[A-Z][a-z]+\s+[A-Z][a-z]+', line):  # Match (truthy) for the first string
        print(line)
|
|
re.IGNORECASE |
|
Ex. 4.47 | Without using a character class, match on each string that ends in .jpg or .JPG (try this another way). |
(hint: use the flag argument (the optional 3rd argument) to re.search()) |
|
import re
# Print each string that ends in '.jpg' in any letter case, then how many
# matched. The re.I flag makes the match case-insensitive.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(r'\.jpg$', string, re.I):  # None (falsy) for the first string
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
Ex. 4.48 | Print only those files that start with 'image#' ('image' plus a possible number) and end in any of these image extensions: '.jpg', '.png', '.gif' |
import re
# Print the files named 'image' plus an optional digit, with a .jpg,
# .png or .gif extension.
filenames = ['image2.jpg', 'image.png', 'file.txt', 'file2.doc',
             'file3.pdf', 'image2.gif', 'image3.jpg', 'image4.jpg',
             'advert.jpg', 'advert.png']  # list of strings
# '^' requires the name to START with 'image'; without it a name such as
# 'myimage2.jpg' would also match.
IMAGE_FILE_PATTERN = r'^image\d?\.(jpg|png|gif)$'
for name in filenames:  # str, 'image2.jpg'
    if re.search(IMAGE_FILE_PATTERN, name):  # Match (truthy) for the first string
        print(name)
|
|
Ex. 4.49 | Match on each string that ends in .jpg or .JPG |
import re
# Print each string that ends in '.jpg' or '.JPG', then how many matched.
match_strings = [
    'hello world 00', 'goodbye world ', ' 23 bonjour', 'wilkommen23 ',
    'aloha', '99', '00', '88557799', 'Que 3 Tal!', 'myfile.jpg',
    'yourfile.JPG',
]  # list of strings
# '$' anchors the extension to the end of the string; without it a name
# such as 'photo.jpg.txt' would also match.
JPG_PATTERN = r'\.(jpg|JPG)$'
count = 0  # int, 0
for string in match_strings:  # str, 'hello world 00'
    if re.search(JPG_PATTERN, string):  # None (falsy) for the first string
        # alternative: re.search(r'\.jpg$', string, re.I)
        print(string)
        count += 1  # int, 1 after the first match
print(f'count: {count}')
|
|
GROUPING FOR QUANTIFYING and ALTERNATES |
|
Ex. 4.50 | Quantifying a group. Match on a number with two decimal places and possible thousands separator (3.95, 3,200.95, etc.) |
First create a pattern that is 1 or more digits followed by a comma separator (i.e. matching on '0,', '00,', '000,') and group it with parentheses; quantify the group to say that there are zero or more of these, followed by one or more digits, a period and 2 digits. (Do not use a custom character class for this purpose.) |
|
import re
# Print the values that are numbers with exactly two decimal places and
# optional comma-separated digit groups in front (3.95, 3,200.95, ...).
values = ['23.9', '18.2', '23.95', '2,238,000.00', '15,382.92', 'joe', '6.05']  # list of str
for value in values:  # str, '23.9'
    # '(\d+,)*' is the quantified group: zero or more "digits plus comma".
    matchobj = re.search(r'^(\d+,)*\d+\.\d\d$', value)  # re.Match object or None
    if matchobj:  # falsy for '23.9' (only one decimal place)
        print(value)
|
|
Ex. 4.51 | Quantifying a Group (2). Write a single regex that matches on q, Q, quit, Quit, QUIT. Do this without a character class and without the alternate vertical bar. |
import re
# Ask whether to quit and accept only 'q' or 'quit', in any letter case.
x = input('Do you want to quit? ')  # str, 'QuIt' (sample input)
# '(uit)?' makes the whole group optional. The anchors require the entire
# answer to be q or quit; unanchored, any answer containing a 'q'
# (e.g. 'quiet') would count as quitting.
if re.search(r'^q(uit)?$', x, re.I):  # Match (truthy) for 'QuIt'
    print("you're quitting!")
else:
    print("you failed to quit.")
|
|
GROUPING FOR EXTRACTION |
|
Ex. 4.52 | Group for extraction. |
Use a parenthetical grouping to extract the number from this text. |
|
import re
# Capture the first run of digits in the line and print it.
line = '34: this is a line of text'  # str
number_pattern = re.compile(r'(\d+)')  # group 1 holds the digits
matchobj = number_pattern.search(line)  # re.Match object
print(matchobj[1])  # str, '34'
|
|
Ex. 4.53 | Group for extraction. Extract the Catalog ID and Publication Date from the text line. |
import re
# Extract the catalog ID (group 1) and publication date (group 2).
rs_row = 'Catalog ID: 2839-587 Pub. Date: 2019-09-03'  # str, 'Catalog ID...'
# The period after 'Pub' is escaped so it matches only a literal '.';
# unescaped it is a wildcard and would accept e.g. 'PubX'.
CATALOG_ROW_PATTERN = (
    r'Catalog\s+ID:\s+(\d+\-\d+)\s+'
    r'Pub\.\s+Date:\s+(\d\d\d\d\-\d\d\-\d\d)'
)
matchobj = re.search(CATALOG_ROW_PATTERN, rs_row)  # re.Match object
if matchobj:
    print(matchobj.group(1))  # str, '2839-587'
    print(matchobj.group(2))  # str, '2019-09-03'
|
|
Ex. 4.54 | Group for extraction. In one regex match, extract the IP address from this log line. |
import re
# Extract the IP address at the start of the log line.
line = '172.26.93.208 - - [28/Jun/2012:21:00:17 -0400] "GET /~cmk380/pythondata/image2b.txt HTTP/1.1" 200 30'
# Each octet is 1-3 digits; {2,3} would miss addresses such as 10.0.0.1.
IP_PATTERN = r'^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
matchobj = re.search(IP_PATTERN, line)  # re.Match object
if matchobj:
    print(matchobj.group(1))  # str, '172.26.93.208'
|
|
'MINIMAL MATCH' QUANTIFIER |
|
Ex. 4.55 | Demonstration: "minimal" match. |
The below regex grabs the word Python from the text. Run the code once to observe this. Now add a question mark ? as the character directly after the "one or more" plus sign and run again - you should see that the "one or more word characters" pattern is now matching on as few characters as possible. |
|
import re
# Greedy demo: '\w+' grabs as many word characters as it can, so group 1
# is the whole word after 'My language is '.
text = 'My language is Python'  # str
language_pattern = re.compile(r'My language is (\w+)')
matchobj = language_pattern.search(text)  # re.Match object
print(matchobj[1])  # str, 'Python'
|
|
Ex. 4.56 | Work with wildcard and minimal match. |
Use the wildcard to match everything between the first two brackets. Note carefully what was printed. |
|
import re
# Minimal match: '.+?' stops at the first closing bracket, so only the
# first bracketed term is captured (brackets included).
text = 'Discussion of terms <TO COME> after something <PLEASE REVIEW>.'
first_tag = re.compile(r'(<.+?>)')
matchobj = first_tag.search(text)  # re.Match object
print(matchobj[1])  # str, '<TO COME>'
|
|
Ex. 4.57 | Match on non-search character. |
Perform the same extraction on the below text by searching for a bracket followed by one or more non-brackets. Text extracted should be the same, except that here the brackets fall outside the group and so are not printed. |
|
import re
# Same extraction with a negated class: '[^>]+' cannot run past the first
# closing bracket. The brackets sit outside the group, so they are not
# part of the captured text.
text = 'Discussion of terms <TO COME> after something <PLEASE REVIEW>.'
tag_body = re.compile(r'<([^>]+)>')
matchobj = tag_body.search(text)  # re.Match object
print(matchobj[1])  # str, 'TO COME'
|
|
GROUPING with .groups() |
|
Ex. 4.58 | Retrieve a grouping with .groups(). |
In one regex match, extract the status code and bytes downloaded (last 2 integers on the line) from this log line. Call .groups() on the match object to reveal the extracted values. |
|
import re
# Capture the last two integers on the log line (status code and bytes).
line = '172.26.93.208 - - [28/Jun/2012:21:00:17 -0400] "GET /~cmk380/pythondata/image2b.txt HTTP/1.1" 200 30'
# str, '172.26.93.208 - - ...'
matchobj = re.search(r'(\d+)\s+(\d+)$', line)  # re.Match object
if matchobj:
    # .groups() returns a tuple of all captured groups, not a list.
    print(matchobj.groups())  # tuple, ('200', '30')
|
|
Ex. 4.59 | Retrieve a grouping with .groups(). |
Extract city, state zip from line. |
|
import re
# Split an address line into city, two-letter state and zip code.
line = 'Los Angeles, CA 91604'  # str
address_pattern = re.compile(r'([^,]+),\s+([A-Z]{2})\s+(\d+)')
matchobj = address_pattern.search(line)  # re.Match object
city_state_zip = matchobj.groups()  # tuple, ('Los Angeles', 'CA', '91604')
print(city_state_zip)
|
|
Ex. 4.60 | Quantify for an optional group. |
Pull out all the info about each person (Favorite Color may not be there). |
|
import re
# Pull name, favorite color (optional) and employee ID out of each row.
results = ['Name: Joe; Favorite Color: Blue; Employee ID: 2395',
           'Name: Marie; Employee ID: 2321',
           'Name: Teneski; Favorite Color: Green; Employee ID: 1913']  # list of strings
# The optional "Favorite Color" section is a NON-capturing group (?:...)?
# so .groups() yields exactly (name, color, id). A capturing group there
# would add the whole 'Favorite Color: Blue; ' text as an extra item.
PERSON_PATTERN = (
    r'^Name:\s+([A-Za-z]+);\s+'
    r'(?:Favorite Color:\s+([A-Za-z]+);\s+)?'
    r'Employee ID:\s+(\d+)'
)
for row in results:  # str, 'Name: Joe; Favorite ...'
    matchobj = re.search(PERSON_PATTERN, row)  # re.Match object or None
    if matchobj:  # skip rows that do not fit the format instead of raising
        # tuple, ('Joe', 'Blue', '2395'); color is None when it is absent
        print(matchobj.groups())
|
|
findall() FOR MULTIPLE MATCHES |
|
Ex. 4.61 | Group and extract with findall(). |
Extract email addresses only for nyu.edu. |
|
import re
# Collect every nyu.edu email address in the text with findall().
text = """There are many ways to contact us. Use the
general email contact@nyu.edu, or email our public
liason at help@nyu.edu. If you need tech support you
can reach us at askits@nyu.edu.
Author: Joe Wilson joe@wilson.com"""  # str
# The period is escaped so only a literal '.' is accepted; as a wildcard
# it would also match text such as 'x@nyuxedu'.
NYU_EMAIL_PATTERN = r'[a-z]+@nyu\.edu'
emails = re.findall(NYU_EMAIL_PATTERN, text)  # list, ['contact@nyu.edu', 'help@nyu.edu', ...]
print(emails)
|
|
re.sub() FOR SUBSTITUTIONS |
|
Ex. 4.62 | Regex substitution. Replace space-separated with comma separated |
import re
# Turn a whitespace-separated string into a comma-separated one.
args = 'this that other and some other'  # str
whitespace_run = re.compile(r'\s+')  # one or more whitespace characters
args2 = whitespace_run.sub(",", args)  # str, 'this,that,other...'
print(args2)
|
|
re.split() FOR PATTERN-BASED DELIMITERS |
|
Ex. 4.63 | Regex split. Split the user-input comma-separated values string into separate digit values. |
import re
# Split comma-separated input into values, ignoring any whitespace that
# follows a comma.
ui = '23, 14, 7,3,9'  # str
comma_delimiter = re.compile(r',\s*')
numbers = comma_delimiter.split(ui)  # list, ['23', '14', '7', '3', '9']
print(numbers)
|
|
Ex. 4.64 | DOTALL wildcard match. |
Extract everything between =code start= and =code end=. Use the re.DOTALL switch to use the wildcard (.) to match on a newline. |
|
import re
# Extract everything between the '=code start=' and '=code end=' markers.
text = """Title of This Text
This is some description...
=code start=
a = 5
b = 5.0
if a == b:
print('yes')
=code end=
This is some discussion...
"""  # str
# re.DOTALL lets '.' match newlines too, so the group can span lines.
code_section = re.compile(r'=code start=(.+)=code end=', re.DOTALL)
matchobj = code_section.search(text)  # re.Match object
print(matchobj[1])  # str, '\na = 5\nb = 5.0\nif a ...'
|
|
Ex. 4.65 | Multiline anchors. |
Use findall() to extract numbers from only the start of each line of the text. Use re.MULTILINE to allow the caret (^) to match at the start of any line. |
|
import re
# Collect the number at the start of each line; numbers elsewhere in a
# line are ignored.
text = """Title of This Text
23 we want to grab some 99 numbers
12 but not others, 17 and then some
5 so we just get 1 the ones
on the left side
93 and me and 23 too
"""  # str
# re.MULTILINE lets '^' match at the start of every line, not just the
# start of the whole string.
leading_number = re.compile(r'^\d+', re.MULTILINE)
matches = leading_number.findall(text)  # list, ['23', '12', '5', '93']
print(matches)
|
|