Advanced Python
In-Class Exercises, Session 4
MATCHING |
|
Ex. 4.1 | Match a simple character pattern. |
Search for 'Velas', then try 'Benter' and 'Acme'. |
|
import re
lines = [
'Acme Corporation is heded by CEO Joseph Benter, and ',
'President Maria Velas. Mr. Benter focuses on R&D ',
'while Ms. Velas provides vision and major deals for ',
'Acme. ']
for line in lines:
if re.search(r'', line):
print(line)
|
|
Expected Output (for Velas): |
|
President Maria Velas. Mr. Benter focuses on R&D while Ms. Velas provides vision and major deals for |
|
Ex. 4.2 | 'not' to negate a search. Execute previous pattern with 'not' in front of re.search() |
import re
lines = [
'Acme Corporation is heded by CEO Joseph Benter, and ',
'President Maria Velas. Mr. Benter focuses on R&D ',
'while Ms. Velas provides vision and major deals for ',
'Acme. ']
for line in lines:
if re.search(r'Benter', line):
print(line)
|
|
Expected Output (for Benter): |
|
while Ms. Velas provides vision and major deals for Acme. |
|
ANCHORS |
|
Ex. 4.3 | Anchors - start of string. |
Print only those lines that have 'TEL' at the start: |
|
import re
for text_line in ['AURORA HOTEL',
'OPEN12:00 AM - 11:59 PM',
'14200 E ALAMEDA AVE AURORA, CO 80012',
'TEL (303) 344-9901']:
if re.search(r'', text_line):
print(text_line)
|
|
Expected Output: |
|
TEL (303) 344-9901 |
|
Ex. 4.4 | Anchors - end of string. |
Print only those files that end in .jpg |
|
import re
filenames = ['image.jpg', 'image.png', 'filejpg.txt', 'file2.doc',
'file3.pdf', 'image2.gif', 'image3.jpg', 'image4.jpg']
for name in filenames:
if re.search(r'', name):
print(name)
|
|
Expected Output: |
|
image.jpg image3.jpg image4.jpg |
|
BUILT-IN CHARACTER CLASSES |
|
Ex. 4.5 | "Digit" character class. |
Match on each string that has a digit. |
|
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
hello world 00 23 bonjour wilkommen23 99 00 88557799 Que 3 Tal! count: 7 |
|
Ex. 4.6 | "Word" character class. |
Match each string that has a letter, number or underscore. |
|
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
Ex. 4.7 | "Space" character class. |
Match on each line that has a space. |
|
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
INVERSE CHARACTER CLASSES |
|
Ex. 4.8 | "Not a digit" character class. |
Match on each string that has a character that is not a digit. |
|
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
hello world 00 goodbye world 23 bonjour wilkommen23 aloha Que 3 Tal! myfile.jpg yourfile.JPG count: 8 |
|
Ex. 4.9 | "Not a space" character class. |
Match on each string that has any non-spaces. |
|
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
' ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
hello world 00 goodbye world 23 bonjour wilkommen23 aloha 99 00 88557799 Que 3 Tal! myfile.jpg yourfile.JPG count: 11 |
|
CUSTOM CHARACTER CLASSES |
|
Ex. 4.10 | Custom character class. |
Match on each string that has a capital letter in it. |
|
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
Que 3 Tal! yourfile.JPG count: 2 |
|
Ex. 4.11 | Using custom character class with built-in character class. |
Match on each string that has a letter followed by a number. |
|
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
wilkommen23 count: 1 |
|
INVERSE CUSTOM CHARACTER CLASSES |
|
Ex. 4.12 | Inverse Custom Character Class. Match on each string that has any character that is not a letter. |
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
hello world 00 goodbye world 23 bonjour wilkommen23 99 00 88557799 Que 3 Tal! myfile.jpg yourfile.JPG count: 10 |
|
THE WILDCARD |
|
Ex. 4.13 | Match on each string that ends with a character that is not a digit. |
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
goodbye world 23 bonjour wilkommen23 aloha Que 3 Tal! myfile.jpg yourfile.JPG count: 7 |
|
Ex. 4.14 | Demo: match on any character. |
Use the wildcard (., a period) to see which strings match it. |
|
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
LAB 1 |
|
Ex. 4.15 | Match on each string that starts with a digit. |
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
99 00 88557799 count: 3 |
|
Ex. 4.16 | Match on each string that starts with a space. |
import re
lines = [ 'this is the first line,',
' and this is the second line and',
' this is the third line. ' ]
for line in lines:
if re.search(r'', line):
print(line)
|
|
Expected Output: |
|
 and this is the second line and  this is the third line.  |
|
Ex. 4.17 | Loop through and print only lines with some text (not including spaces). |
import re
text = """line 1
line 2,
line 3...
line4!"""
lines = text.splitlines()
for line in lines:
if re.search(r'', line):
print(line)
|
|
Expected Output: |
|
line 1 line 2, line 3... line4! |
|
Ex. 4.18 | Match on each string that ends with a digit. |
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
hello world 00 99 00 88557799 count: 4 |
|
Ex. 4.19 | Match on each line that ends with a space. |
import re
lines = [ 'this is the first line, ',
'this is the second line and',
'this is the third line. ' ]
for line in lines:
if re.search(r'', line):
print(line)
|
|
Expected Output: |
|
this is the first line, this is the third line. |
|
Ex. 4.20 | Match on each string that consists only of a 2-digit number. |
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
99 00 count: 2 |
|
Ex. 4.21 | Match on a capital letter followed by a lowercase letter. |
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
Que 3 Tal! count: 1 |
|
Ex. 4.22 | Match on files with date format YYYY-MM-DD followed by '.txt'. |
import re
dirlist = ('.', '..', '2010-12-15.txt', '2010-12-16.txt',
'testfile.txt', '20101-11-03.txt')
for item in dirlist:
if re.search(r'', item):
print(item)
|
|
Expected Output: |
|
2010-12-15.txt 2010-12-16.txt |
|
Ex. 4.23 | Match on date format MM/DD/YY (and not 4-digit year). |
import re
dates = ['Jan. 3, 2018', '23-Mar-17', '12/02/98', '12/03/1998', '23.17.2018']
for date in dates:
if re.search(r'', date):
print(date)
|
|
Expected Output: |
|
12/02/98 |
|
Ex. 4.24 | Determine whether the selected word begins with a vowel. If so, prepend 'an' rather than 'a'. |
import re
words = ['apple', 'pear', 'orange', 'kiwi', 'elderberry', 'carrot', 'ugli fruit']
for word in words:
if re.search(r'', word):
prepend = 'an'
else:
prepend = 'a'
print(f"{prepend} {word}")
|
|
Expected Output: |
|
an apple a pear an orange a kiwi an elderberry a carrot an ugli fruit |
|
BUILT-IN QUANTIFIERS |
|
Ex. 4.25 | "One or more" quantifier. Match on each string that has one or more letters in it. |
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
hello world 00 goodbye world 23 bonjour wilkommen23 aloha Que 3 Tal! myfile.jpg yourfile.JPG count: 8 |
|
Ex. 4.26 | "Zero or one" quantifier. |
Without using a character class (or grouped alternates), use a single regex that matches on each string that has 'a' or 'an' followed by a space. |
|
import re
lines = [
'This is a wonderful thing. ',
"I haven't seen anything like it. ",
"Isn't it an exceptional experience? "]
for line in lines:
if re.search(r'', line):
print(line)
|
|
Expected Output: |
|
This is a wonderful thing. Isn't it an exceptional experience? |
|
Ex. 4.27 | "Zero or more" quantifier, quantifiers with anchor. |
Match on all strings that consist only of a 1 followed by zero or more digits. |
|
import re
numbers = [
'100',
'135',
'31',
'1',
'I think',
]
for val in numbers:
if re.search(r'', val):
print(val)
|
|
Expected Output: |
|
100 135 1 |
|
Ex. 4.28 | Quantifiers with Anchor. Match on each string that consists only of one or more digit characters. |
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
99 00 88557799 count: 3 |
|
Ex. 4.29 | Quantifiers with Anchor (3). Match on each string that consists only of letters. |
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
aloha count: 1 |
|
Ex. 4.30 | Quantifiers with custom character class. |
Match each string that has a capital letter followed by one or more lowercase letters. |
|
import re
match_strings = [
'hello World 00',
'goodbye C world ',
' 23 bonjour',
'wilkommen23 ',
'Aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
hello World 00 Aloha Que 3 Tal! count: 3 |
|
Ex. 4.31 | Quantifiers with anchors. Match on each string that consists only of letters, numbers or the underscore. |
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
aloha 99 00 88557799 count: 4 |
|
Ex. 4.32 | Quantifiers with anchors (2). Match on each string that consists only of non-digits. |
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
goodbye world aloha myfile.jpg yourfile.JPG count: 4 |
|
Ex. 4.33 | Quantifiers with anchors (3). Match on each string that consists only of non-spaces. |
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
aloha 99 00 88557799 myfile.jpg yourfile.JPG count: 6 |
|
CUSTOM QUANTIFIERS |
|
Ex. 4.34 | Custom quantifier. |
Match on each string that has two or more spaces at the end. |
|
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
goodbye world wilkommen23 count: 2 |
|
Ex. 4.35 | Custom quantifier. |
Match on strings that have a capital letter followed by two or more lowercase letters. |
|
import re
match_strings = [
'hello World 00',
'goodbye As world ',
'To 23 bonjour',
'wilkommen23 ',
'Aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
hello World 00 Aloha Que 3 Tal! count: 3 |
|
Ex. 4.36 | Custom quantifier. |
Print those numbers that are in the millions (i.e., 7 or more digits). |
|
import re
nums = [
'1',
'10',
'100',
'1000',
'10000',
'100000',
'1000000',
'10000000'
]
for num in nums:
if re.search(r'', num):
print(num)
|
|
Expected Output: |
|
1000000 10000000 |
|
Ex. 4.37 | Custom quantifier. Having split the text into words, show those words that are 7 or more characters in size. |
import re
text = """This is the 1000th story, regarding a duck
named Quack. It was unlikely that Quack could have been
given a name like that by his mother, so we can only conclude
that he was named by the author, who has a cuteness problem."""
words = text.split()
stripped = [ word.rstrip('.,') for word in words ]
for word in stripped:
if re.search(r'', word):
print(word)
|
|
Expected Output: |
|
regarding unlikely conclude cuteness problem |
|
Ex. 4.38 | Custom Quantifier. |
A password must be 3-8 characters in length (letters, numbers and underscores are permitted). Validate the below password attempts. |
|
import re
attempts = [
'1234',
'hello_there',
'password',
'ok',
'what?',
'supercalifrag']
for password in attempts:
if re.search(r'', password):
print(f'{password}: validated')
|
|
Expected Output: |
|
1234: validated password: validated |
|
ESCAPING SPECIAL CHARACTERS |
|
Ex. 4.39 | Escape wildcard (aka period). Match on each string that has a letter, number or underscore followed by a period. |
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
myfile.jpg yourfile.JPG count: 2 |
|
Note: why would this work without escaping the period? Because |
|
Ex. 4.40 | Escape end anchor (aka dollar sign). |
Match on strings that have a dollar amount, including two decimal places ($23.53). |
|
import re
lines = [
'The coat cost $239.50.',
'The candy cost $1.93',
"I didn't buy anything today.",
'$1 sale',
'I dream of $$$'
]
for line in lines:
if re.search(r'', line):
print(line)
|
|
Expected Output: |
|
The coat cost $239.50. The candy cost $1.93 |
|
Ex. 4.41 | Escape quantifier character +. |
Match on all lines with positive numbers. |
|
import re
numbers = [
'Amount: -23.9',
'Amount: +43.8',
'Amount: -9.03',
'Amount: +99.9',
'Amount: +22.0'
]
for num in numbers:
if re.search(r'', num):
print(num)
|
|
Expected Output: |
|
Amount: +43.8 Amount: +99.9 Amount: +22.0 |
|
Ex. 4.42 | Escape quantifier character *. |
Match on all lines with asterisked footnotes. |
|
import re
numbers = [
'As Ibid* said,',
'there should be no greater good ',
'than compassion*, love, ',
'mutual benefit*',
'and the profit-making motive.',
]
for num in numbers:
if re.search(r'', num):
print(num)
|
|
Expected Output: |
|
As Ibid* said, than compassion*, love, mutual benefit* |
|
LAB 2 |
|
Ex. 4.43 | Match on each string that has one or more "word" characters, followed by one or more spaces, followed by one or more "word" characters. |
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
hello world 00 goodbye world 23 bonjour Que 3 Tal! count: 4 |
|
Ex. 4.44 | Ignore comment lines: print only those lines that don't start with a comment (the first non-space character is a hash mark). |
import re
text = """
# this is a program to do stuff
a = 5
b = 10 # an int
if True:
# multiply them
c = a * b
"""
for line in text.splitlines():
if not re.search(r'', line):
print(line)
|
|
Expected Output: |
|
a = 5 b = 10 # an int if True: c = a * b |
|
Ex. 4.45 | Match those lines that contain a 7-digit hex number (a-fA-F0-9). |
import re
lines = [
'The color code is #ABF2307.',
'Mr. Mxyzptlk is 999 years old today.',
'The memory address is fc9d223.'
]
for line in lines:
if re.search(r'', line):
print(line)
|
|
Expected Output: |
|
The color code is #ABF2307. The memory address is fc9d223. |
|
Ex. 4.46 | Show those lines that contain two capitalized words (as in a name). |
import re
lines = [
'The owner is Gwen Harstridge.',
"There aren't a lot of stores like this one.",
'Paris is not a lot like Rome.',
'I hail from Los Angeles, California.'
]
for line in lines:
if re.search(r'', line):
print(line)
|
|
Expected Output: |
|
The owner is Gwen Harstridge. I hail from Los Angeles, California. |
|
re.IGNORECASE |
|
Ex. 4.47 | Without using a character class, match on each string that ends in .jpg or .JPG (try this another way). |
(hint: use the flag argument (the optional 3rd argument) to re.search()) |
|
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
myfile.jpg yourfile.JPG count: 2 |
|
Ex. 4.48 | Print only those files that start with 'image#' ('image' plus a possible number) and end in any of these image extensions: '.jpg', '.png', '.gif' |
import re
filenames = ['image2.jpg', 'image.png', 'file.txt', 'file2.doc',
'file3.pdf', 'image2.gif', 'image3.jpg', 'image4.jpg',
'advert.jpg', 'advert.png']
for name in filenames:
if re.search(r'', name):
print(name)
|
|
Expected Output: |
|
image2.jpg image.png image2.gif image3.jpg image4.jpg |
|
Ex. 4.49 | Match on each string that ends in .jpg or .JPG |
import re
match_strings = [
'hello world 00',
'goodbye world ',
' 23 bonjour',
'wilkommen23 ',
'aloha',
'99',
'00',
'88557799',
'Que 3 Tal!',
'myfile.jpg',
'yourfile.JPG'
]
count = 0
for string in match_strings:
if re.search(r'', string):
print(string)
count += 1
print(f'count: {count}')
|
|
Expected Output: |
|
myfile.jpg yourfile.JPG count: 2 |
|
GROUPING FOR QUANTIFYING and ALTERNATES |
|
Ex. 4.50 | Quantifying a group. Match on a number with two decimal places and a possible thousands separator (3.95, 3,200.95, etc.) |
First create a pattern that is 1 or more digits followed by a comma separator (i.e. matching on '0,', '00,', '000,') and group it with parentheses; quantify the group to say that there are zero or more of these, followed by one or more digits, a period and 2 digits. (Do not use a custom character class for this purpose.) |
|
import re
values = ['23.9', '18.2', '23.95', '2,238,000.00', '15,382.92', 'joe', '6.05'] # list of str
for value in values:
matchobj = re.search(r'', value)
if matchobj:
print(value)
|
|
Expected Output: |
|
23.95 2,238,000.00 15,382.92 |
|
Note that if you see the message AttributeError: 'NoneType' object has no attribute 'group', this means that the search did not find a match and returned None, and the code attempted to call .group() on None. Check the string and pattern to determine why it did not match. |
|
Ex. 4.51 | Quantifying a Group (2). Write a single regex that matches on q, Q, quit, Quit, QUIT. Do this without a character class and without the alternate vertical bar. |
import re
x = input('Do you want to quit? ') # str, 'QuIt' (sample input)
if re.search(r'', x):
print("you're quitting!")
else:
print("you failed to quit.")
|
|
Expected Output: |
|
Do you want to quit? QuIt you're quitting! |
|
GROUPING FOR EXTRACTION |
|
Ex. 4.52 | Group for extraction. |
Use a parenthetical grouping to extract the number from this text. |
|
import re
line = '34: this is a line of text'
matchobj = re.search(r'', line)
print(matchobj.group(1))
|
|
Expected Output: |
|
34 |
|
Note that if you see the message AttributeError: 'NoneType' object has no attribute 'group', this means that the search did not find a match and returned None, and the code attempted to call .group() on None. Check the string and pattern to determine why it did not match. |
|
Ex. 4.53 | Group for extraction. Extract the Catalog ID and Publication Date from the text line. |
import re
rs_row = 'Catalog ID: 2839-587 Pub. Date: 2019-09-03'
matchobj = re.search(r'', rs_row)
if matchobj:
print(matchobj.group(1))
print(matchobj.group(2))
|
|
Expected Output: |
|
2839-587 2019-09-03 |
|
Note that if you see the message AttributeError: 'NoneType' object has no attribute 'group', this means that the search did not find a match and returned None, and the code attempted to call .group() on None. Check the string and pattern to determine why it did not match. |
|
Ex. 4.54 | Group for extraction. In one regex match, extract the IP address from this log line. |
import re
line = '172.26.93.208 - - [28/Jun/2012:21:00:17 -0400] "GET /~cmk380/pythondata/image2b.txt HTTP/1.1" 200 30'
matchobj = re.search(r'', line)
if matchobj:
print(matchobj.group(1))
|
|
Expected Output: |
|
172.26.93.208 |
|
Note that if you see the message AttributeError: 'NoneType' object has no attribute 'group', this means that the search did not find a match and returned None, and the code attempted to call .group() on None. Check the string and pattern to determine why it did not match. |
|
'MINIMAL MATCH' QUANTIFIER |
|
Ex. 4.55 | Demonstration: "minimal" match. |
The below regex grabs the word Python from the text. Run the code once to observe this. Now add a question mark ? as the character directly after the "one or more" plus sign and run again - you should see that the "one or more word characters" pattern is now matching on as few characters as possible. |
|
import re
text = 'My language is Python'
matchobj = re.search(r'', text)
print(matchobj.group(1))
|
|
Expected Output: |
|
P |
|
Note that if you see the message AttributeError: 'NoneType' object has no attribute 'group', this means that the search did not find a match and returned None, and the code attempted to call .group() on None. Check the string and pattern to determine why it did not match. |
|
Ex. 4.56 | Work with wildcard and minimal match. |
Use the wildcard to match everything between the first pair of angle brackets. Note carefully what was printed. (Angle brackets are not special regex characters, so they do not need to be escaped with a backslash; don't forget that extraction requires grouping parentheses.) |
|
import re
text = 'Discussion of terms <TO COME> after something <PLEASE REVIEW>.'
matchobj = re.search(r'', text)
print(matchobj.group(1))
|
|
Expected Output: |
|
TO COME |
|
Note that if you see the message AttributeError: 'NoneType' object has no attribute 'group', this means that the search did not find a match and returned None, and the code attempted to call .group() on None. Check the string and pattern to determine why it did not match. |
|
Ex. 4.57 | Match on non-search character. |
Perform the same extraction on the below text by searching for a bracket followed by one or more non-brackets. Text extracted should be the same. |
|
import re
text = 'Discussion of terms <TO COME> after something <PLEASE REVIEW>.'
matchobj = re.search(r'', text)
print(matchobj.group(1))
|
|
Expected Output: |
|
TO COME |
|
Note that if you see the message AttributeError: 'NoneType' object has no attribute 'group', this means that the search did not find a match and returned None, and the code attempted to call .group() on None. Check the string and pattern to determine why it did not match. |
|
GROUPING with .groups() |
|
Ex. 4.58 | Retrieve a grouping with .groups(). |
In one regex match, extract the status code and bytes downloaded (last 2 integers on the line) from this log line. Call .groups() on the match object to reveal the extracted values. |
|
import re
line = '172.26.93.208 - - [28/Jun/2012:21:00:17 -0400] "GET /~cmk380/pythondata/image2b.txt HTTP/1.1" 200 30'
matchobj = re.search(r'', line)
if matchobj:
print(matchobj.groups())
|
|
Expected Output: |
|
('200', '30') |
|
Note that if you see the message AttributeError: 'NoneType' object has no attribute 'group', this means that the search did not find a match and returned None, and the code attempted to call .group() on None. Check the string and pattern to determine why it did not match. |
|
Ex. 4.59 | Retrieve a grouping with .groups(). |
Extract the city, state and zip from the line. |
|
import re
line = 'Los Angeles, CA 91604'
matchobj = re.search(r'', line)
print(matchobj.groups())
|
|
Expected Output: |
|
('Los Angeles', 'CA', '91604') |
|
Note that if you see the message AttributeError: 'NoneType' object has no attribute 'group', this means that the search did not find a match and returned None, and the code attempted to call .group() on None. Check the string and pattern to determine why it did not match. |
|
Ex. 4.60 | Quantify for an optional group. |
Pull out all the info about each person (Favorite Color may not be there). |
|
import re
results = [ 'Name: Joe; Favorite Color: Blue; Employee ID: 2395',
'Name: Marie; Employee ID: 2321',
'Name: Teneski; Favorite Color: Green; Employee ID: 1913' ]
for row in results:
matchobj = re.search(r'', row)
print(matchobj.groups())
|
|
Expected Output: |
|
('Joe', 'Favorite Color: Blue; ', 'Blue', '2395') ('Marie', None, None, '2321') ('Teneski', 'Favorite Color: Green; ', 'Green', '1913') |
|
Note that if you see the message AttributeError: 'NoneType' object has no attribute 'group', this means that the search did not find a match and returned None, and the code attempted to call .group() on None. Check the string and pattern to determine why it did not match. |
|
findall() FOR MULTIPLE MATCHES |
|
Ex. 4.61 | Group and extract with findall(). |
Extract email addresses only for nyu.edu. |
|
import re
text = """There are many ways to contact us. Use the
general email contact@nyu.edu, or email our public
liason at help@nyu.edu. If you need tech support you
can reach us at askits@nyu.edu.
Author: Joe Wilson joe@wilson.com"""
emails = re.findall(r'', text)
print(emails)
|
|
Expected Output: |
|
['contact@nyu.edu', 'help@nyu.edu', 'askits@nyu.edu'] |
|
re.sub() FOR SUBSTITUTIONS |
|
Ex. 4.62 | Regex substitution. Replace space-separated with comma separated |
import re
args = 'this that other and some other'
args2 = re.sub(r'', ",", args)
print(args2)
|
|
Expected Output: |
|
this,that,other,and,some,other |
|
re.split() FOR PATTERN-BASED DELIMITERS |
|
Ex. 4.63 | Regex split. Split the user-input comma-separated values string into separate digit values. |
import re
ui = '23, 14, 7,3,9'
numbers = re.split(r'', ui)
print(numbers)
|
|
Expected Output: |
|
['23', '14', '7', '3', '9'] |
|
Ex. 4.64 | DOTALL wildcard match. |
Extract everything between =code start= and =code end=. Use the re.DOTALL flag to allow the wildcard (.) to match on a newline. |
|
import re
text = """Title of This Text
This is some description...
=code start=
a = 5
b = 5.0
if a == b:
print('yes')
=code end=
This is some discussion...
"""
matchobj = re.search(r'', text)
print(matchobj.group(1))
|
|
Expected Output: |
|
a = 5 b = 5.0 if a == b: print('yes') |
|
Ex. 4.65 | Multiline anchors. |
Use findall() to extract numbers from only the start of each line of the text. Use re.MULTILINE to allow the caret (^) to match at the start of any line. |
|
import re
text = """Title of This Text
23 we want to grab some 99 numbers
12 but not others, 17 and then some
5 so we just get 1 the ones
on the left side
93 and me and 23 too
"""
matches = re.findall(r'', text)
print(matches)
|
|
Expected Output: |
|
['23', '12', '5', '93'] |
|