Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
)
with db_session_from_cmd_out(result) as session:
# Verify total message count
assert session.query(Message).count() == 9297
# Get message contents from DB
msg = session.query(Message).filter_by(pff_identifier=msg_id).one()
headers, body = msg.headers, msg.body
if expected.with_messages:
# Access message directly and compare
archive_file = list(enron_dataset_part027.glob("*.pst"))[0]
with open_mail_archive(archive_file) as archive:
message = archive.get_message_by_id(msg_id)
assert cleanup_message_body(*archive.get_message_body(message)) == body
assert archive.get_message_headers(message) == headers
else:
assert headers is None
assert body is None
def test_cleanup_message_body(body, body_type, result):
    """Check that cleaning ``body`` as ``body_type`` yields ``result``."""
    cleaned = cleanup_message_body(body, body_type)
    assert cleaned == result
# Run entity extraction job with message content flag on
result = extract_entities(
params, enron_dataset_part001, isolated_cli_runner, expected
)
# Get message contents from DB
with db_session_from_cmd_out(result) as session:
msg = session.query(Message).filter_by(pff_identifier=msg_id).one()
headers, body = msg.headers, msg.body
# Access message directly and compare
archive_file = list(enron_dataset_part001.glob("*.pst"))[0]
with open_mail_archive(archive_file) as archive:
message = archive.get_message_by_id(msg_id)
assert cleanup_message_body(*archive.get_message_body(message)) == body
assert archive.get_message_headers(message) == headers
"""
Job function for the worker processes
"""
# Return basic types to avoid serialization issues
res = {
"filepath": filepath,
"message_id": message_id,
"date": date,
"processing_start_time": datetime.utcnow(),
"attachments": attachments,
}
try:
# Extract entities from the message
message_body = cleanup_message_body(
body, body_type, RATOM_SPACY_MODEL_MAX_LENGTH
)
doc = spacy_model(message_body)
res["entities"] = [(ent.text, ent.label_) for ent in doc.ents]
res["processing_end_time"] = datetime.utcnow()
if include_message_contents:
res["body"] = message_body
res["headers"] = headers
return res, None
except Exception as exc:
return res, str(exc)
try:
for msg_info in get_messages(
files,
progress_callback=update_progress,
with_content=include_message_contents,
with_headers=include_message_contents,
):
# Extract results
message_id = msg_info.pop("message_id")
filepath = msg_info.pop("filepath")
attachments = msg_info.pop("attachments")
if include_message_contents:
msg_info["body"] = cleanup_message_body(
msg_info["body"], msg_info.pop("body_type")
)
# Create new message instance
message = Message(pff_identifier=message_id, **msg_info)
# Link message to a file_report
try:
file_report = session.query(FileReport).filter_by(path=filepath).one()
except Exception as exc:
file_report = None
logger.info(
f"Unable to link message id {message_id} to a file. Error: {exc}"
)
message.file_report = file_report