In this blog post we will capture dates, phone numbers and email addresses from text with the help of regular expressions.
If you are new to regular expressions, you can start with the following blog post:
In [1]:
import re
In [2]:
# Sample biography containing several dates written as "<day> <month> <year>".
# Adjacent string literals are joined at compile time, so `text` is a single
# line with no embedded newlines.
text = (
    "Kaifi Azmi (born Athar Husain Rizvi; 14 January 1919 – 10 May 2002) was an Indian Urdu poet. "
    "He is remembered as the one who brought Urdu literature to Indian motion pictures. Shaukat Kaifi was born into a "
    "Shia Muslim family of Uttar Pradesh migrants in Hyderabad State. She grew up in Aurangabad, India. At a young age, "
    "she fell in love and married the Urdu poet Kaifi Azmi.Shaukat Kaifi (21 October 1926 – 22 November 2019) "
    "also credited as Shaukat Azmi, was an Indian theater and film actress. Shaukat and Kaifi's daughter, Shabana Azmi "
    "(born 18 September 1950) is an Indian actress of Hindi film, television and theatre."
)
print(text)
In [4]:
# Match dates written as "<day> <month name> <year>", e.g. "14 January 1919".
# The pattern is built up piece by piece:
#   \d{1,2}     day of month, one or two digits
#   \s          a single whitespace separator
#   [A-Za-z]+   month name (at least one letter, so "12  2020" is rejected)
#   \s          a single whitespace separator
#   \d{4}       four-digit year
# \b on both ends keeps a match from starting or ending inside a longer run
# of digits. A raw string is used so the backslashes reach the regex engine
# untouched instead of being parsed as (invalid) string escape sequences.
pattern = r"\b\d{1,2}\s[A-Za-z]+\s\d{4}\b"
In [5]:
# Collect every non-overlapping match of `pattern` in `text`, then print
# each one on its own line.
matches = re.findall(pattern, text)
for match in matches:
    print(match)
In [5]:
# Sample sentence mixing a calendar date with two phone numbers that use
# different separators (space and hyphen).
text = (
    "today is Jan 28, 2022 and tomorrow call me at "
    "234 567-8763 or 234-578-8763"
)
print(text)
In [6]:
# Match phone numbers shaped like "234 567-8763" or "234-578-8763".
# Built up step by step:
#   \d{3}       three-digit area code
#   [-\s]       a hyphen or a whitespace character
#   \d{3}       three-digit exchange
#   -           literal hyphen
#   \d{4}       four-digit line number
# A looser alternative such as "[0-9\s\-]{10,13}" accepts ANY run of digits,
# spaces and hyphens, so it returns matches with a leading space and also
# fires on unrelated digit runs; the structured form avoids both problems.
# \b keeps the match from starting or ending inside a longer number, and the
# raw string stops Python from interpreting the backslashes itself.
pattern = r"\b\d{3}[-\s]\d{3}-\d{4}\b"
In [7]:
# Collect every non-overlapping match of `pattern` in `text`, then print
# each one on its own line.
matches = re.findall(pattern, text)
for match in matches:
    print(match)
In [2]:
# Sample message containing three international phone numbers in different
# layouts. Adjacent literals are concatenated into one single-line string.
text = (
    "Tomorrow we are going to watch movie and after that dinner. Rohan you will inform "
    "John, kartik and Manisha. Their numbers are +91 8124564397, +91 (755) 322 6754 and +1 (812)-654-6754. "
    "Please do't forget to call them. "
)
print(text)
In [13]:
# Match international phone numbers in either of two layouts:
#   "+91 8124564397"        country code, whitespace, ten digits
#   "+91 (755) 322 6754"    country code, whitespace, "(area)" and two
#   "+1 (812)-654-6754"     groups separated by whitespace or a hyphen
# Both layouts share the "+<1-3 digit country code><whitespace>" prefix, so
# it is written once and the two bodies sit in a non-capturing group (which
# keeps re.findall returning the whole match rather than a sub-group).
# The trailing \b rejects numbers that run on into more digits.
# Raw strings keep the backslashes intact for the regex engine.
pattern = (
    r"\+\d{1,3}\s"
    r"(?:\d{10}|\(\d{3}\)[\s-]\d{3}[\s-]\d{4})"
    r"\b"
)
In [14]:
# Collect every non-overlapping match of `pattern` in `text`, then print
# each one on its own line.
matches = re.findall(pattern, text)
for match in matches:
    print(match)
In [21]:
# Space-separated date samples in assorted formats, used to exercise the
# numeric day/month/year pattern.
text = " ".join(
    [
        "05/3/2017",
        "3/01/2017",
        "1/6/17",
        "34/11/937",
        "may 21, 2017",
        "21st mar 2017",
    ]
)
print(text)
In [12]:
# Building blocks for a numeric D/M/YYYY date pattern. They are meant to be
# concatenated in the order: day, separator, month, separator, year.

# Day of month, 1-31, with an optional leading zero ("5", "05", "17", "31").
# The simplest version, "(0?[1-9])", only covers days 1-9; the alternation
# adds 10-29 and 30-31. The (?<!\d) guard stops a match from starting in the
# middle of a longer number, so "34/11/2017" is not read as "4/11/2017".
patterndays = r"(?<!\d)((0?[1-9])|([12][0-9])|(3[01]))"
# Month, 1-12, with an optional leading zero.
patternmonth = r"((0?[1-9])|(1[0-2]))"
# Four-digit year from 1900 to 2099. The (?!\d) guard rejects years that run
# on into more digits, so "1/6/20175" is not read as "1/6/2017".
patternyear = r"((19[0-9]{2})|(20[0-9]{2}))(?!\d)"
# Separator between the three date parts.
patternsep = "/"
In [13]:
# Assemble the full date pattern as day <sep> month <sep> year and show the
# resulting regular expression.
pattern = patternsep.join((patterndays, patternmonth, patternyear))
print(pattern)
In [14]:
# Lazily iterate over the matches of `pattern` in `text`; each item produced
# is a match object rather than a plain string.
n = re.compile(pattern).finditer(text)
In [15]:
# Print each match object; its repr shows the matched span and text.
for item in n:
    print(item)
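\w matches any word character: [a-zA-Z0-9_] (for str patterns also other Unicode letters and digits, unless re.ASCII is used)
\d matches any decimal digit: [0-9] (for str patterns also other Unicode digits, unless re.ASCII is used)
. matches any character except a newline (unless re.DOTALL is used)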
In [16]:
# Email addresses to test against the pattern; the list mixes .com / .in
# domains with other top-level domains.
candidates = [
    "info@xcelvations.com",
    "nutan.xcelvations@gmail.com",
    "training@mail.xcelvations.com",
    "training123@xcelvations.in",
    "not-valid@example.zoo",
    "nutan@yahoo.com",
    "john_mathew@yahoo.xyz",
]
In [17]:
# Email pattern restricted to .com and .in addresses:
#   [\w.+-]+       local part: word characters, dots, plus and minus signs
#                  (\w already includes digits, so no separate \d is needed)
#   @              literal at-sign
#   ([\w.]+\.)+    one or more domain labels, each followed by a dot
#   (com|in)\b     top-level domain; \b makes sure the TLD ends there, so
#                  "user@example.info" is not accepted as "...example.in"
# A raw string keeps the backslashes intact for the regex engine.
pattern = r'[\w.+-]+@([\w.]+\.)+(com|in)\b'
In [18]:
# Compile the email pattern once so the same pattern object can be reused
# for every candidate address.
address = re.compile(pattern)
In [19]:
# Report, one row per candidate, whether the compiled pattern is found in it.
# The address is left-aligned in a 30-character column so the verdicts line up.
row_template = '{:<30} {}'
for candidate in candidates:
    match = address.search(candidate)
    verdict = 'Matches' if match else 'No match'
    print(row_template.format(candidate, verdict))
In [ ]: