|
| 1 | +import os |
| 2 | +import re |
| 3 | +import email |
| 4 | +from email import policy |
| 5 | +from email.parser import BytesParser |
| 6 | +from email.header import decode_header |
| 7 | +from datetime import datetime |
| 8 | +from bs4 import BeautifulSoup |
| 9 | +from docx import Document |
| 10 | + |
folder_path = "data"
output_data = []

# Matches any run of two or more whitespace characters; compiled once
# because it is applied to every non-empty body line of every message.
_MULTI_SPACE = re.compile(r"\s{2,}")


def _format_date(date_header):
    """Return the message date formatted as ``DD.MM.YYYY HH:MM``.

    Args:
        date_header: The value of ``msg["Date"]``; ``None`` when the
            message has no Date header.

    Returns:
        An empty string when the header is missing, the formatted date
        when the header object carries a parsed ``datetime`` attribute,
        and otherwise the raw header text unchanged.
    """
    if date_header is None:
        return ""
    # Headers parsed with policy.default are expected to expose the parsed
    # value as `.datetime` (None when the date could not be recognised),
    # so no manual slicing of the header string is needed.
    parsed = getattr(date_header, "datetime", None)
    if parsed is None:
        return str(date_header)
    return parsed.strftime("%d.%m.%Y %H:%M")


def _clean_body(text):
    """Strip each line, drop blank lines and collapse repeated whitespace."""
    return "\n".join(
        _MULTI_SPACE.sub(" ", line.strip())
        for line in text.splitlines()
        if line.strip()
    )


# Read and parse every .eml file in the folder.
# os.listdir() gives no ordering guarantee, so the names are sorted to keep
# the order of the collected rows reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".eml"):
        continue

    eml_path = os.path.join(folder_path, filename)
    with open(eml_path, "rb") as file:
        msg = BytesParser(policy=policy.default).parse(file)

    # Prefer the plain-text body and fall back to the HTML one.
    body = msg.get_body(preferencelist=("plain", "html"))
    # Compare with None explicitly: a message object's truth value is not
    # a reliable "was a body found" test.
    body_content = body.get_content() if body is not None else ""

    # Convert an HTML body to plain text.
    if body is not None and body.get_content_type() == "text/html":
        body_content = BeautifulSoup(body_content, "html.parser").get_text()

    body_content = _clean_body(body_content)

    # Collect the file names of all attachments that have one.
    attachments = [
        attach_name
        for attach_name in (
            part.get_filename() for part in msg.iter_attachments()
        )
        if attach_name
    ]

    # Missing headers come back as None; store empty strings instead so
    # that every collected value is a plain str when it is later written
    # out, and so a missing subject does not render as "None".
    subject = str(msg["Subject"] or "")
    output_data.append({
        "Дата/время": _format_date(msg["Date"]),
        "Отправитель (от кого)": str(msg["From"] or ""),
        "Получатель (кому)": str(msg["To"] or ""),
        "Содержание письма / Тема": f"Тема: {subject}\n\n{body_content}\n\n",
        "Названия вложений": ", ".join(attachments) if attachments else " ",
    })

print(f"\nВсего обработано писем: {len(output_data)}")
| 65 | + |
# Build the Word document: a title followed by one five-column table.
doc = Document()
doc.add_heading('Список писем', 0)

# Column titles, in display order. The same strings are the keys of each
# dictionary in output_data, so they drive both the header row and the
# per-message rows below.
column_titles = (
    'Дата/время',
    'Отправитель (от кого)',
    'Получатель (кому)',
    'Содержание письма / Тема',
    'Названия вложений',
)

table = doc.add_table(rows=1, cols=5)
table.style = 'Table Grid'

# Header row
for cell, title in zip(table.rows[0].cells, column_titles):
    cell.text = title

# One table row per collected message
for email_data in output_data:
    new_cells = table.add_row().cells
    for cell, title in zip(new_cells, column_titles):
        cell.text = email_data[title]

# Save the .docx file and report the result
doc_path = "emails.docx"
doc.save(doc_path)

print(f"\nФайл Word с таблицей сохранён: {doc_path}")
print(f"Обработано {len(output_data)} писем.")
0 commit comments