# Source code for melusine.prepare_email.manage_transfer_reply

import re
from melusine import config

# All patterns used in this module come from the "manage_transfer_reply"
# section of the package's regex configuration.
_transfer_reply_regex = config["regex"]["manage_transfer_reply"]

# Patterns matched against the "header" column.
regex_transfer_header = _transfer_reply_regex["transfer_header"]
regex_answer_header = _transfer_reply_regex["answer_header"]

# Patterns searched for in the "body" column to spot a forwarded message.
regex_begin_transfer = _transfer_reply_regex["begin_transfer"]
regex_begin_transfer_cons = _transfer_reply_regex["begin_transfer_cons"]

# Patterns used to pull the "from", "to", "date" and "header" values out of
# the body of a forwarded message.
regex_extract_from = _transfer_reply_regex["extract_from"]
regex_extract_to = _transfer_reply_regex["extract_to"]
regex_extract_date = _transfer_reply_regex["extract_date"]
regex_extract_header = _transfer_reply_regex["extract_header"]


def add_boolean_transfer(row):
    """Tell whether the "header" of an email matches the transfer pattern.

    The module-level ``regex_transfer_header`` pattern is applied with
    ``re.match``, i.e. it has to match at the beginning of ``row["header"]``.

    To be used with methods such as: `apply(func, axis=1)` or
    `apply_by_multiprocessing(func, axis=1, **kwargs)`.

    Parameters
    ----------
    row : row of pd.Dataframe, columns ['header']

    Returns
    -------
    bool
        True if the header matches the transfer pattern. False if it does
        not match, if the 'header' entry is missing, or if it is not a
        string (e.g. a missing value).

    Notes
    -----
    Only problems with the row data are turned into ``False``. An invalid
    configured pattern is a configuration error and is allowed to propagate
    instead of silently flagging every email as "not a transfer".

    Examples
    --------
        >>> import pandas as pd
        >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle')
        >>> # data contains a 'header' column

        >>> from melusine.prepare_email.manage_transfer_reply import add_boolean_transfer
        >>> add_boolean_transfer(data.iloc[0])  # apply for 1 sample
        >>> data.apply(add_boolean_transfer, axis=1)  # apply to all samples

    """
    try:
        header = row["header"]
    except (KeyError, TypeError, IndexError):
        # No usable 'header' entry in this row.
        return False

    # Missing values are not strings and cannot be matched against.
    if not isinstance(header, str):
        return False

    return re.match(regex_transfer_header, header) is not None


def add_boolean_answer(row):
    """Tell whether the "header" of an email matches the answer pattern.

    The module-level ``regex_answer_header`` pattern is applied with
    ``re.match``, i.e. it has to match at the beginning of ``row["header"]``.

    To be used with methods such as: `apply(func, axis=1)` or
    `apply_by_multiprocessing(func, axis=1, **kwargs)`.

    Parameters
    ----------
    row : row of pd.Dataframe, columns ['header']

    Returns
    -------
    bool
        True if the header matches the answer pattern. False if it does not
        match, if the 'header' entry is missing, or if it is not a string
        (e.g. a missing value).

    Notes
    -----
    Only problems with the row data are turned into ``False``. An invalid
    configured pattern is a configuration error and is allowed to propagate
    instead of silently flagging every email as "not an answer".

    Examples
    --------
        >>> import pandas as pd
        >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle')
        >>> # data contains a 'header' column

        >>> from melusine.prepare_email.manage_transfer_reply import add_boolean_answer
        >>> add_boolean_answer(data.iloc[0])  # apply for 1 sample
        >>> data.apply(add_boolean_answer, axis=1)  # apply to all samples

    """
    try:
        header = row["header"]
    except (KeyError, TypeError, IndexError):
        # No usable 'header' entry in this row.
        return False

    # Missing values are not strings and cannot be matched against.
    if not isinstance(header, str):
        return False

    return re.match(regex_answer_header, header) is not None


def check_mail_begin_by_transfer(row):
    """Tell whether the "body" of an email contains a transfer marker.

    The body is searched with ``re.search`` for either of the module-level
    patterns ``regex_begin_transfer`` and ``regex_begin_transfer_cons``.
    Because ``re.search`` scans the whole string, whether the marker has to
    sit at the very beginning of the body depends on how those patterns are
    anchored.

    To be used with methods such as: `apply(func, axis=1)` or
    `apply_by_multiprocessing(func, axis=1, **kwargs)`.

    Parameters
    ----------
    row : row of pd.Dataframe, columns ['body']

    Returns
    -------
    bool
        True if at least one of the two patterns is found in the body.
        False if neither is found, if the 'body' entry is missing, or if it
        is not a string (e.g. a missing value).

    Notes
    -----
    Only problems with the row data are turned into ``False``. An invalid
    configured pattern is a configuration error and is allowed to propagate.

    Examples
    --------
        >>> import pandas as pd
        >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle')
        >>> # data contains a 'body' column

        >>> from melusine.prepare_email.manage_transfer_reply import check_mail_begin_by_transfer
        >>> check_mail_begin_by_transfer(data.iloc[0])  # apply for 1 sample
        >>> data.apply(check_mail_begin_by_transfer, axis=1)  # apply to all samples

    """
    try:
        body = row["body"]
    except (KeyError, TypeError, IndexError):
        # No usable 'body' entry in this row.
        return False

    # Missing values are not strings and cannot be searched.
    if not isinstance(body, str):
        return False

    return any(
        re.search(pattern, body) is not None
        for pattern in (regex_begin_transfer, regex_begin_transfer_cons)
    )


def update_info_for_transfer_mail(row):
    """Extracts and updates informations from forwarded mails, such as: body,
    from, to, header, date.

    When ``row["is_begin_by_transfer"]`` is truthy:

    - 'from', 'to', 'date' and 'header' are replaced, in that order, by the
      values extracted from the body with the module-level
      ``regex_extract_*`` patterns (the header becomes the initial subject
      extracted from the forwarded email).
    - 'body' is replaced by the text that follows the first occurrence of
      the extracted header.

    The row is modified in place and also returned.

    To be used with methods such as: `apply(func, axis=1)` or
    `apply_by_multiprocessing(func, axis=1, **kwargs)`.

    Parameters
    ----------
    row : row of pd.Dataframe,
    columns ['body', 'header', 'from', 'to', 'date', 'is_begin_by_transfer']

    Returns
    -------
    row of pd.Dataframe
        The same row object that was passed in. It is returned untouched if
        the row is not flagged as a transfer, if a required entry is missing
        or if the body is not a string.

    Notes
    -----
    Extraction stops at the first field whose pattern is not found in the
    body: fields extracted before that point stay updated, later fields and
    the body are left as they were.

    Examples
    --------
        >>> import pandas as pd
        >>> from melusine.prepare_email.manage_transfer_reply import check_mail_begin_by_transfer
        >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle')
        >>> data['is_begin_by_transfer'] = data.apply(check_mail_begin_by_transfer, axis=1)
        >>> # data contains columns ['from', 'to', 'date', 'header', 'body', 'is_begin_by_transfer']

        >>> from melusine.prepare_email.manage_transfer_reply import update_info_for_transfer_mail
        >>> update_info_for_transfer_mail(data.iloc[0])  # apply for 1 sample
        >>> data.apply(update_info_for_transfer_mail, axis=1)  # apply to all samples

    """
    try:
        if not row["is_begin_by_transfer"]:
            return row
        body = row["body"]
    except (KeyError, TypeError, IndexError):
        # Required entries are missing: nothing can be extracted.
        return row

    # Missing values are not strings and cannot be parsed.
    if not isinstance(body, str):
        return row

    field_patterns = (
        ("from", regex_extract_from),
        ("to", regex_extract_to),
        ("date", regex_extract_date),
        ("header", regex_extract_header),
    )
    for column, pattern in field_patterns:
        pieces = re.split(pattern, body)
        if len(pieces) < 2:
            # Pattern not found in the body: stop here and keep whatever was
            # extracted so far (see Notes in the docstring).
            return row
        # Element 1 of the split is the first captured group of the first
        # match when the pattern has a capturing group.
        row[column] = pieces[1]

    subjects = re.findall(regex_extract_header, body)
    if not subjects:
        return row

    subject = subjects[0]
    # findall yields tuples for multi-group patterns and may yield an empty
    # string; neither can be used as a separator, so the body is kept as is.
    if isinstance(subject, str) and subject:
        # Keep everything after the FIRST occurrence of the subject only, so
        # later mentions of the same text inside the message are preserved.
        row["body"] = body.partition(subject)[2]

    return row