"""
Cleaning of the body and the header
"""
import unidecode
import unicodedata
import re
from melusine import config
# Cleaning-related entries of the "regex" section of the melusine configuration.
REGEX_CLEAN = config["regex"]["cleaning"]

# Entry read from the cleaning configuration for header cleaning
# (presumably consumed by header-cleaning code located elsewhere - confirm).
regex_clean_header_dict = REGEX_CLEAN["clean_header_dict"]

# Iterable of regex patterns; each match is replaced by a single space in
# remove_multiple_spaces_and_strip_text.
regex_remove_multiple_spaces_list = REGEX_CLEAN["remove_multiple_spaces_list"]

# Mapping {regex pattern: replacement} applied by flag_items.
regex_flags_dict = config["text_flagger"]["text_flags"]
def clean_body(row, flags=True):
    """Clean the 'last_body' entry of a row.

    The value stored under ``row["last_body"]`` is converted to ``str`` and
    then processed in two stages:

    - ``clean_text``: lowercasing, accent removal, line break removal,
      removal of '<' and '>', whitespace normalisation
    - ``flag_items``: replacement of specific items by flags
      (only when ``flags`` is truthy)

    Parameters
    ----------
    row : row of pandas.DataFrame object,
        Data contains 'last_body' column.
    flags : boolean, optional
        True if you want to flag relevant info, False if not.
        Default value, True.

    Returns
    -------
    str
        The cleaned body. (Applying this function row-wise over a whole
        DataFrame presumably yields a pandas.Series of such strings.)
    """
    # str() guarantees a string even when the cell holds a non-string value.
    raw_body = str(row["last_body"])
    return flag_items(clean_text(raw_body), flags=flags)
def clean_text(text):
    """Clean a string by running it through a fixed sequence of steps.

    The steps are applied in this order:

    - Putting all letters to lowercase
    - Removing all the accents
    - Removing all line breaks
    - Removing the '<' and '>' symbols
    - Removing the multiple spaces and stripping the text

    ``remove_apostrophe`` is not part of the sequence.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    """
    # Order matters: each step receives the output of the previous one.
    cleaning_steps = (
        text_to_lowercase,
        remove_accents,
        remove_line_break,
        remove_superior_symbol,
        remove_multiple_spaces_and_strip_text,
    )
    for apply_step in cleaning_steps:
        text = apply_step(text)
    return text
def text_to_lowercase(text):
    """Return ``text`` with all its letters converted to lowercase."""
    lowered = text.lower()
    return lowered
def remove_accents(text, use_unidecode=False):
    """Remove accents from text.

    Two strategies are available:

    - default: the text is NFKD-normalised with ``unicodedata`` and every
      character that cannot be encoded as ASCII is dropped
    - ``use_unidecode=True``: the text is transliterated with ``unidecode``,
      which is more powerful but much more time consuming

    Example: the joined 'ae' character is converted to 'a' + 'e' by unidecode
    while it is suppressed by unicodedata.

    Parameters
    ----------
    text : str
    use_unidecode : boolean, optional
        True to use unidecode, False to use unicodedata.
        Default value, False.

    Returns
    -------
    str
    """
    if not use_unidecode:
        # NFKD splits an accented letter into base letter + combining mark;
        # the ASCII encoding with "ignore" then discards the mark.
        decomposed = unicodedata.normalize("NFKD", text)
        ascii_bytes = decomposed.encode("ASCII", "ignore")
        return ascii_bytes.decode("utf-8")
    return unidecode.unidecode(text)
def remove_line_break(text):
    """Remove line breaks from text.

    Every "\\n" character is deleted without inserting a replacement, so the
    last word of a line is joined to the first word of the next line.
    Other characters (including "\\r") are left untouched.
    """
    line_parts = text.split("\n")
    return "".join(line_parts)
def remove_superior_symbol(text):
    """Remove the superior ('>') and inferior ('<') symbols from text."""
    for symbol in (">", "<"):
        text = text.replace(symbol, "")
    return text
def remove_apostrophe(text):
    """Replace every apostrophe (') in text by a single space."""
    fragments = text.split("'")
    return " ".join(fragments)
[docs]def remove_multiple_spaces_and_strip_text(text):
"""Replace configured patterns by a single space and strip the text.

Every pattern of ``regex_remove_multiple_spaces_list`` is substituted by a
single space with ``re.sub``, and leading/trailing whitespace is removed
with ``str.strip``.

What exactly gets replaced depends on the configured patterns; the
previous documentation mentions multiple spaces and the '-' and '*'
characters (to be confirmed against the configuration).

Parameters
----------
text : str,
Text to clean (for instance header content).

Returns
-------
str
"""
# NOTE(review): indentation is missing in this copy of the file, so it is not
# visible whether strip() runs after each substitution or once after the loop.
for regex_remove_multiple_spaces in regex_remove_multiple_spaces_list:
text = re.sub(regex_remove_multiple_spaces, " ", text)
text = text.strip()
return text
def flag_items(text, flags=True):
    """Flag relevant information in text.

    ex : amount, phone number, email address, postal code (5 digits)..

    Each (pattern, replacement) pair of ``regex_flags_dict`` is applied to the
    text with a case-insensitive ``re.sub``, following the iteration order of
    the dictionary.

    Parameters
    ----------
    text : str,
        Body content.
    flags : boolean, optional
        True if you want to flag relevant info, False if not.
        Default value, True.

    Returns
    -------
    str
        The flagged text, or the unchanged text when ``flags`` is falsy.
    """
    if not flags:
        return text
    for pattern, replacement in regex_flags_dict.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return text