@@ -294,83 +294,119 @@ See [Python docs - Compilation Flags](https://docs.python.org/3/howto/regex.html
294294
295295### <a name =" pattern-matching-and-extraction " ></a >Pattern matching and extraction
296296
297- * matching/extracting sequence of characters
298- * use ` re.search() ` to see if a string contains a pattern or not
299- * use ` re.findall() ` to get a list of matching patterns
300- * use ` re.split() ` to get a list from splitting a string based on a pattern
301- * their syntax given below
297+ To match or extract a sequence of characters, use:
298+
299+ * `re.search()` to check whether the input string contains a pattern
300+ * `re.findall()` to get a list of all the matching portions
301+ * `re.split()` to get a list by splitting the input string based on a pattern
302+
303+ Their syntax is as follows:
302304
303305``` python
304306re.search(pattern, string, flags = 0 )
305307re.findall(pattern, string, flags = 0 )
306308re.split(pattern, string, maxsplit = 0 , flags = 0 )
307309```
308310
311+ * As a good practice, always use **raw strings** to construct an RE, unless other formats are required
312+ * this avoids clashes between backslash escaping in the RE and in normal quoted strings
313+ * examples for ` re.search `
314+
309315``` python
310- >> > import re
311- >> > string = " This is a sample string"
316+ >> > sentence = ' This is a sample string'
312317
313- >> > bool (re.search(' is' , string))
318+ # using normal string methods
319+ >> > ' is' in sentence
314320True
315-
316- >> > bool (re.search(' this' , string))
321+ >> > ' xyz' in sentence
317322False
318323
319- >> > bool (re.search( ' this ' , string, re.I))
320- True
321-
322- >> > bool (re.search(' T ' , string ))
324+ # need to load the re module before use
325+ >> > import re
326+ # check if 'sentence' contains the pattern described by RE argument
327+ >> > bool (re.search(r ' is ' , sentence ))
323328True
324-
325- >> > bool (re.search(' is a' , string))
329+ >> > bool (re.search(r ' this' , sentence, flags = re.I))
326330True
327-
328- >> > re.findall(' i' , string)
329- [' i' , ' i' , ' i' ]
331+ >> > bool (re.search(r ' xyz' , sentence))
332+ False
330333```
331334
332- * using regular expressions
333- * use the ` r'' ` format when using regular expression elements
335+ * examples for ` re.findall `
334336
335337``` python
336- >> > string
337- ' This is a sample string'
338-
339- >> > re.findall(' is' , string)
340- [' is' , ' is' ]
341-
342- >> > re.findall(' \b is' , string)
343- []
338+ # match whole word par with optional s at start and e at end
339+ >> > re.findall(r ' \b s? pare? \b ' , ' par spar apparent spare part pare' )
340+ [' par' , ' spar' , ' spare' , ' pare' ]
341+
342+ # numbers >= 100 with optional leading zeros
343+ >> > re.findall(r ' \b 0* [1-9 ]\d {2,} \b ' , ' 0501 035 154 12 26 98234' )
344+ [' 0501' , ' 154' , ' 98234' ]
345+
346+ # if multiple capturing groups are used, each element of output
347+ # will be a tuple of strings of all the capture groups
348+ >> > re.findall(r ' ( x* ) :( y* ) ' , ' xx:yyy x: x:yy :y' )
349+ [(' xx' , ' yyy' ), (' x' , ' ' ), (' x' , ' yy' ), (' ' , ' y' )]
350+
351+ # normal capture group will hinder ability to get whole match
352+ # non-capturing group to the rescue
353+ >> > re.findall(r ' \b\w * (?: st| in) \b ' , ' cost akin more east run against' )
354+ [' cost' , ' akin' , ' east' , ' against' ]
355+
356+ # useful for debugging purposes as well before applying substitution
357+ >> > re.findall(r ' t. *? a' , ' that is quite a fabricated tale' )
358+ [' tha' , ' t is quite a' , ' ted ta' ]
359+ ```
344360
345- >> > re.findall(r ' \b is' , string)
346- [' is' ]
361+ * examples for ` re.split `
347362
348- >> > re.findall(r ' \w + ' , string)
349- [' This' , ' is' , ' a' , ' sample' , ' string' ]
363+ ``` python
364+ # split based on one or more digit characters
365+ >> > re.split(r ' \d + ' , ' Sample123string42with777numbers' )
366+ [' Sample' , ' string' , ' with' , ' numbers' ]
350367
351- >> > re.split(r ' \s + ' , string)
352- [' This' , ' is' , ' a' , ' sample' , ' string' ]
368+ # split based on digit or whitespace characters
369+ >> > re.split(r ' [\d\s ]+ ' , ' **1\f 2\n 3star\t 7 77\r **' )
370+ [' **' , ' star' , ' **' ]
353371
354- >> > re.split(r ' \d + ' , ' Sample123string54with908numbers' )
355- [' Sample' , ' string' , ' with' , ' numbers' ]
372+ # to include the matching delimiter strings as well in the output
373+ >> > re.split(r ' ( \d + ) ' , ' Sample123string42with777numbers' )
374+ [' Sample' , ' 123' , ' string' , ' 42' , ' with' , ' 777' , ' numbers' ]
356375
357- >> > re.split(r ' ( \d + ) ' , ' Sample123string54with908numbers' )
358- [' Sample' , ' 123' , ' string' , ' 54' , ' with' , ' 908' , ' numbers' ]
376+ # use non-capturing group if capturing is not needed
377+ >> > re.split(r ' hand(?: y| ful) ' , ' 123handed42handy777handful500' )
378+ [' 123handed42' , ' 777' , ' 500' ]
359379```
360380
361381* backreferencing
362382
363383``` python
364- >> > quote = " So many books, so little time"
384+ # whole words that have at least one consecutive repeated character
385+ >> > words = [' effort' , ' flee' , ' facade' , ' oddball' , ' rat' , ' tool' ]
365386
366- >> > re.search(r ' ( [a-z ]{2,} ) . * \1 ' , quote, re.I)
367- < _sre.SRE_Match object ; span= (0 , 17 ), match= ' So many books, so' >
387+ >> > [w for w in words if re.search(r ' \b\w * ( \w ) \1 \w * \b ' , w)]
388+ [' effort' , ' flee' , ' oddball' , ' tool' ]
389+ ```
368390
369- >> > re.search(r ' ( [a-z ]) \1 ' , quote, re.I)
370- < _sre.SRE_Match object ; span= (9 , 11 ), match= ' oo' >
391+ * The `re.search` function returns a `re.Match` object (or `None` if there is no match), from which
392+ various details can be extracted, such as the matched portion of the string, its location, etc.
393+ * **Note** that the output here is shown for Python version **3.7**
371394
372- >> > re.findall(r ' ( [a-z ]) \1 ' , quote, re.I)
373- [' o' , ' t' ]
395+ ``` python
396+ >> > re.search(r ' b. * d' , ' abc ac adc abbbc' )
397+ < re.Match object ; span= (1 , 9 ), match= ' bc ac ad' >
398+ # retrieving entire matched portion
399+ >> > re.search(r ' b. * d' , ' abc ac adc abbbc' )[0 ]
400+ ' bc ac ad'
401+
402+ # capture group example
403+ >> > m = re.search(r ' a( . * ) d( . * a) ' , ' abc ac adc abbbc' )
404+ # to get matched portion of second capture group
405+ >> > m[2 ]
406+ ' c a'
407+ # to get a tuple of all the capture groups
408+ >> > m.groups()
409+ (' bc ac a' , ' c a' )
374410```
375411
376412<br >
@@ -383,55 +419,61 @@ True
383419re.sub(pattern, repl, string, count = 0 , flags = 0 )
384420```
385421
386- * simple substitutions
387- * ` re.sub ` will not change value of variable passed to it, has to be explicity assigned
422+ * examples
423+ * **Note** that as strings are immutable, `re.sub` will not change the value of the variable
424+ passed to it; the result has to be explicitly assigned
388425
389426``` python
390- >> > sentence = ' This is a sample string'
391- >> > re.sub(' sample' , ' test' , sentence)
392- ' This is a test string'
393-
394- >> > sentence
395- ' This is a sample string'
396- >> > sentence = re.sub(' sample' , ' test' , sentence)
397- >> > sentence
398- ' This is a test string'
399-
400- >> > re.sub(' /' , ' -' , ' 25/06/2016' )
401- ' 25-06-2016'
402- >> > re.sub(' /' , ' -' , ' 25/06/2016' , count = 1 )
403- ' 25-06/2016'
404-
405- >> > greeting = ' ***** Have a great day *****'
406- >> > re.sub(' \*' , ' =' , greeting)
407- ' ===== Have a great day ====='
427+ >> > ip_lines = " catapults\n concatenate\n cat"
428+ >> > print (re.sub(r ' ^ ' , r ' * ' , ip_lines, flags = re.M))
429+ * catapults
430+ * concatenate
431+ * cat
432+
433+ # replace 'par' only at start of word
434+ >> > re.sub(r ' \b par' , r ' X' , ' par spar apparent spare part' )
435+ ' X spar apparent spare Xt'
436+
437+ # same as: r'part|parrot|parent'
438+ >> > re.sub(r ' par( en| ro) ? t' , r ' X' , ' par part parrot parent' )
439+ ' par X X X'
440+
441+ # remove first two columns where : is delimiter
442+ >> > re.sub(r ' \A ( [^ : ]+ :) {2} ' , r ' ' , ' foo:123:bar:baz' , count = 1 )
443+ ' bar:baz'
408444```
409445
410446* backreferencing
411447
412448``` python
413- >> > words = ' night and day'
414- >> > re.sub(r ' ( \w + ) ( \w + ) ( \w + ) ' , r ' \3\2\1 ' , words)
415- ' day and night'
416-
417- >> > line = ' Can you spot the the mistakes? I i seem to not'
418- >> > re.sub(r ' \b ( \w + ) \1 \b ' , r ' \1 ' , line, flags = re.I)
419- ' Can you spot the mistakes? I seem to not'
449+ # remove any number of consecutive duplicate words separated by space
450+ # quantifiers can be applied to backreferences too!
451+ >> > re.sub(r ' \b ( \w + ) ( \1 ) + \b ' , r ' \1 ' , ' a a a walking for for a cause' )
452+ ' a walking for a cause'
453+
454+ # add something around the matched strings
455+ >> > re.sub(r ' \d + ' , r ' ( \g <0>0) ' , ' 52 apples and 31 mangoes' )
456+ ' (520) apples and (310) mangoes'
457+
458+ # swap words that are separated by a comma
459+ >> > re.sub(r ' ( \w + ) ,( \w + ) ' , r ' \2 ,\1 ' , ' a,b 42,24' )
460+ ' b,a 24,42'
420461```
421462
422463* using functions in replace part of ` re.sub() `
464+ * ** Note** that Python version ** 3.7** is used here
423465
424466``` python
425- >> > import math
467+ >> > from math import factorial
426468>> > numbers = ' 1 2 3 4 5'
427-
428469>> > def fact_num (n ):
429- ... return str (math. factorial(int (n.group( 1 ) )))
470+ ... return str (factorial(int (n[ 0 ] )))
430471...
431- >> > re.sub(r ' ( \d + ) ' , fact_num, numbers)
472+ >> > re.sub(r ' \d + ' , fact_num, numbers)
432473' 1 2 6 24 120'
433474
434- >> > re.sub(r ' ( \d + ) ' , lambda m : str (math.factorial(int (m.group(1 )))), numbers)
475+ # using lambda
476+ >> > re.sub(r ' \d + ' , lambda m : str (factorial(int (m[0 ]))), numbers)
435477' 1 2 6 24 120'
436478```
437479
@@ -443,35 +485,28 @@ re.sub(pattern, repl, string, count=0, flags=0)
443485
444486### <a name =" compiling-regular-expressions " ></a >Compiling Regular Expressions
445487
488+ * Regular expressions can be compiled using ` re.compile ` function, which gives back a
489+ ` re.Pattern ` object
490+ * The top level ` re ` module functions are all available as methods for this object
491+ * Compiling a regular expression helps if the RE has to be used in multiple
492+ places or called upon multiple times inside a loop (speed benefit)
493+ * By default, Python maintains a small cache of recently used REs, so the speed benefit
494+ doesn't apply for trivial use cases
495+
446496``` python
447- >> > swap_words = re.compile(r ' ( \w + ) ( \w + ) ( \w + ) ' )
448- >> > swap_words
449- re.compile(' (\\ w+)( \\ w+ )(\\ w+)' )
450-
451- >> > words = ' night and day'
452-
453- >> > swap_words.search(words).group()
454- ' night and day'
455- >> > swap_words.search(words).group(1 )
456- ' night'
457- >> > swap_words.search(words).group(2 )
458- ' and '
459- >> > swap_words.search(words).group(3 )
460- ' day'
461- >> > swap_words.search(words).group(4 )
462- Traceback (most recent call last):
463- File " <stdin>" , line 1 , in < module>
464- IndexError : no such group
465-
466- >> > bool (swap_words.search(words))
497+ >> > pet = re.compile(r ' dog' )
498+ >> > type (pet)
499+ < class ' re.Pattern' >
500+ >> > bool (pet.search(' They bought a dog' ))
467501True
468- >> > swap_words.findall(words )
469- [( ' night ' , ' and ' , ' day ' )]
502+ >> > bool (pet.search( ' A cat crossed their path ' ) )
503+ False
470504
471- >> > swap_words.sub(r ' \3\2\1 ' , words)
472- ' day and night'
473- >> > swap_words.sub(r ' \3\2\1 ' , ' yin and yang' )
474- ' yang and yin'
505+ >> > remove_parentheses = re.compile(r ' \( [^ ) ]* \) ' )
506+ >> > remove_parentheses.sub(' ' , ' a+b(addition) - foo() + c%d (#modulo)' )
507+ ' a+b - foo + c%d'
508+ >> > remove_parentheses.sub(' ' , ' Hi there(greeting). Nice day(a(b)' )
509+ ' Hi there. Nice day'
475510```
476511
477512<br >
0 commit comments