Source code for melusine.prepare_email.manage_transfer_reply

import re
from melusine import config

# Every pattern used by this module is read from the
# "manage_transfer_reply" entry of the "regex" section of the package
# configuration; the section is looked up once and reused below.
_transfer_reply_regex = config["regex"]["manage_transfer_reply"]

# Patterns applied to the "header" field of an email.
regex_transfer_header = _transfer_reply_regex["transfer_header"]
regex_answer_header = _transfer_reply_regex["answer_header"]

# Patterns applied to the "body" field to detect a forwarded message.
regex_begin_transfer = _transfer_reply_regex["begin_transfer"]
regex_begin_transfer_cons = _transfer_reply_regex["begin_transfer_cons"]

# Patterns applied to the "body" field to pull the original metadata
# (sender, recipient, date, subject) out of a forwarded message.
regex_extract_from = _transfer_reply_regex["extract_from"]
regex_extract_to = _transfer_reply_regex["extract_to"]
regex_extract_date = _transfer_reply_regex["extract_date"]
regex_extract_header = _transfer_reply_regex["extract_header"]


def add_boolean_transfer(row):
    """Tell whether the "header" of an email matches the transfer regex.

    The header is tested with ``re.match`` against the 'transfer_header'
    regex of the configuration, so the pattern has to match at the very
    beginning of the header.

    To be used with methods such as: `apply(func, axis=1)` or
    `apply_by_multiprocessing(func, axis=1, **kwargs)`.

    Parameters
    ----------
    row : row of pd.Dataframe, columns ['header']

    Returns
    -------
    bool
        True if the header matches the 'transfer_header' regex.
        False if it does not match, if the row has no 'header' entry or
        if the header is not a string (e.g. a missing value).

    Examples
    --------
        >>> import pandas as pd
        >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle')
        >>> # data contains a 'header' column

        >>> from melusine.prepare_email.manage_transfer_reply import add_boolean_transfer
        >>> add_boolean_transfer(data.iloc[0])  # apply for 1 sample
        >>> data.apply(add_boolean_transfer, axis=1)  # apply to all samples

    """
    try:
        return re.match(regex_transfer_header, row["header"]) is not None
    except (KeyError, TypeError):
        # KeyError: the row has no 'header' entry.
        # TypeError: the header is not a string (None, NaN, ...).
        # Both are data problems and mean "not a transfer"; any other
        # error (such as an invalid configured regex) is left to propagate
        # instead of being silently turned into False.
        return False
def add_boolean_answer(row):
    """Tell whether the "header" of an email matches the answer regex.

    The header is tested with ``re.match`` against the 'answer_header'
    regex of the configuration, so the pattern has to match at the very
    beginning of the header.

    To be used with methods such as: `apply(func, axis=1)` or
    `apply_by_multiprocessing(func, axis=1, **kwargs)`.

    Parameters
    ----------
    row : row of pd.Dataframe, columns ['header']

    Returns
    -------
    bool
        True if the header matches the 'answer_header' regex.
        False if it does not match, if the row has no 'header' entry or
        if the header is not a string (e.g. a missing value).

    Examples
    --------
        >>> import pandas as pd
        >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle')
        >>> # data contains a 'header' column

        >>> from melusine.prepare_email.manage_transfer_reply import add_boolean_answer
        >>> add_boolean_answer(data.iloc[0])  # apply for 1 sample
        >>> data.apply(add_boolean_answer, axis=1)  # apply to all samples

    """
    try:
        return re.match(regex_answer_header, row["header"]) is not None
    except (KeyError, TypeError):
        # KeyError: the row has no 'header' entry.
        # TypeError: the header is not a string (None, NaN, ...).
        # Both are data problems and mean "not an answer"; any other
        # error (such as an invalid configured regex) is left to propagate
        # instead of being silently turned into False.
        return False
def check_mail_begin_by_transfer(row):
    """Tell whether the "body" of an email matches a begin-of-transfer regex.

    The body is tested with ``re.search`` against the 'begin_transfer' and
    'begin_transfer_cons' regexes of the configuration; one match is enough.
    Because ``re.search`` is used, the match is only restricted to the
    beginning of the body if the configured patterns are anchored.

    To be used with methods such as: `apply(func, axis=1)` or
    `apply_by_multiprocessing(func, axis=1, **kwargs)`.

    Parameters
    ----------
    row : row of pd.Dataframe, columns ['body']

    Returns
    -------
    bool
        True if at least one of the two regexes is found in the body.
        False if none is found, if the row has no 'body' entry or if the
        body is not a string (e.g. a missing value).

    Examples
    --------
        >>> import pandas as pd
        >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle')
        >>> # data contains a 'body' column

        >>> from melusine.prepare_email.manage_transfer_reply import check_mail_begin_by_transfer
        >>> check_mail_begin_by_transfer(data.iloc[0])  # apply for 1 sample
        >>> data.apply(check_mail_begin_by_transfer, axis=1)  # apply to all samples

    """
    try:
        body = row["body"]
        return any(
            re.search(pattern, body)
            for pattern in (regex_begin_transfer, regex_begin_transfer_cons)
        )
    except (KeyError, TypeError):
        # KeyError: the row has no 'body' entry.
        # TypeError: the body is not a string (None, NaN, ...).
        # Both are data problems and mean "does not begin by a transfer";
        # any other error (such as an invalid configured regex) is left to
        # propagate instead of being silently turned into False.
        return False
def update_info_for_transfer_mail(row):
    """Extracts and updates informations from forwarded mails, such as:
    body, from, to, header, date.

    - It changes the header by the initial subject (extracted from the
      forwarded email).
    - It removes the header from emails' body.

    The update is all-or-nothing: if any of the pieces of information
    cannot be extracted from the body, the row is returned untouched
    rather than half updated.

    To be used with methods such as: `apply(func, axis=1)` or
    `apply_by_multiprocessing(func, axis=1, **kwargs)`.

    Parameters
    ----------
    row : row of pd.Dataframe,
        columns ['body', 'header', 'from', 'to', 'date', 'is_begin_by_transfer']

    Returns
    -------
    row of pd.Dataframe
        The same row object, updated in place when 'is_begin_by_transfer'
        is truthy and every extraction succeeded, unchanged otherwise.

    Examples
    --------
        >>> import pandas as pd
        >>> from melusine.prepare_email.manage_transfer_reply import check_mail_begin_by_transfer
        >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle')
        >>> data['is_begin_by_transfer'] = data.apply(check_mail_begin_by_transfer, axis=1)
        >>> # data contains columns ['from', 'to', 'date', 'header', 'body', 'is_begin_by_transfer']

        >>> from melusine.prepare_email.manage_transfer_reply import update_info_for_transfer_mail
        >>> update_info_for_transfer_mail(data.iloc[0])  # apply for 1 sample
        >>> data.apply(update_info_for_transfer_mail, axis=1)  # apply to all samples

    """
    try:
        if not row["is_begin_by_transfer"]:
            return row

        body = row["body"]

        # Element 1 of re.split is the first capture group when the pattern
        # defines one (presumably the case for the configured patterns --
        # confirm against the configuration); it does not exist when the
        # pattern is not found, which raises IndexError.
        sender = re.split(regex_extract_from, body)[1]
        recipient = re.split(regex_extract_to, body)[1]
        date = re.split(regex_extract_date, body)[1]
        header = re.split(regex_extract_header, body)[1]

        # Keep what follows the FIRST occurrence of the extracted subject.
        # Splitting only once leaves later occurrences of the same text in
        # the message untouched.
        subject_text = re.findall(regex_extract_header, body)[0]
        stripped_body = body.split(subject_text, 1)[1]
    except (KeyError, TypeError, IndexError, ValueError):
        # KeyError: a required entry is missing from the row.
        # TypeError: the body (or an extracted value) is not a string.
        # IndexError: one of the patterns was not found in the body.
        # ValueError: the extracted subject is empty (empty separator).
        # Nothing has been written to the row at this point.
        return row

    # Every value was extracted: write them all at once so the row never
    # ends up mixing forwarded and original information.
    row["from"] = sender
    row["to"] = recipient
    row["date"] = date
    row["header"] = header
    row["body"] = stripped_body
    return row