first commit
This commit is contained in:
commit
e6c52820cd
227 changed files with 16156 additions and 0 deletions
70
modules/email/draft.py
Normal file
70
modules/email/draft.py
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
from typing import Optional, Dict, Any
|
||||
from datetime import datetime
|
||||
import uuid
|
||||
|
||||
|
||||
class EmailDraft:
    """
    An automatically generated email draft.

    File storage: Data/email_drafts/<id>.json
    """

    # Keys emitted by to_dict(), in stable serialization order.
    _FIELDS = (
        "id", "prospect_id", "to_email", "subject", "content",
        "status", "template_id", "task_id", "created_at",
        "sent_at", "error_message", "metadata",
    )

    def __init__(
        self,
        prospect_id: str,
        to_email: str,
        subject: str,
        content: str,
        status: str = "draft",  # draft | sent | failed
        template_id: Optional[str] = None,
        task_id: Optional[str] = None,
        id: Optional[str] = None,
        created_at: Optional[str] = None,
        sent_at: Optional[str] = None,
        error_message: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        # Generate an id / creation timestamp when none is supplied.
        self.id = id or f"ed_{uuid.uuid4().hex[:10]}"
        self.prospect_id = prospect_id
        self.to_email = to_email
        self.subject = subject
        self.content = content
        self.status = status
        self.template_id = template_id
        self.task_id = task_id
        self.created_at = created_at or datetime.utcnow().isoformat()
        self.sent_at = sent_at
        self.error_message = error_message
        self.metadata = metadata or {}

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the draft to a JSON-compatible dictionary."""
        return {name: getattr(self, name) for name in self._FIELDS}

    @staticmethod
    def from_dict(data: Dict[str, Any]) -> "EmailDraft":
        """Rebuild a draft from a dictionary produced by to_dict()."""
        get = data.get
        return EmailDraft(
            id=get("id"),
            prospect_id=get("prospect_id", ""),
            to_email=get("to_email", ""),
            subject=get("subject", ""),
            content=get("content", ""),
            status=get("status", "draft"),
            template_id=get("template_id"),
            task_id=get("task_id"),
            created_at=get("created_at"),
            sent_at=get("sent_at"),
            error_message=get("error_message"),
            metadata=get("metadata") or {},
        )
|
||||
89
modules/email/draft_handler.py
Normal file
89
modules/email/draft_handler.py
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
import os
|
||||
import json
|
||||
from typing import List, Optional, Dict, Any
|
||||
from datetime import datetime
|
||||
|
||||
from modules.email.draft import EmailDraft
|
||||
|
||||
|
||||
class DraftHandler:
    """
    Draft manager backed by one JSON file per draft.

    Directory: Data/email_drafts
    """

    def __init__(self, base_dir: Optional[str] = None):
        # Project root = three directory levels up from this module.
        base_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        self.base_dir = base_dir or os.path.join(base_root, "Data", "email_drafts")
        os.makedirs(self.base_dir, exist_ok=True)

    def _draft_path(self, draft_id: str) -> str:
        """Absolute path of the JSON file backing a draft id."""
        return os.path.join(self.base_dir, f"{draft_id}.json")

    def _write(self, draft: EmailDraft) -> None:
        """Persist the draft to its JSON file (UTF-8, pretty-printed)."""
        with open(self._draft_path(draft.id), "w", encoding="utf-8") as fh:
            json.dump(draft.to_dict(), fh, ensure_ascii=False, indent=2)

    def add_draft(self, draft: EmailDraft) -> str:
        """Create (or overwrite) the draft file and return its id."""
        self._write(draft)
        return draft.id

    def get_draft(self, draft_id: str) -> Optional[EmailDraft]:
        """Load one draft, or None when no file exists for the id."""
        path = self._draft_path(draft_id)
        if not os.path.exists(path):
            return None
        with open(path, "r", encoding="utf-8") as fh:
            return EmailDraft.from_dict(json.load(fh))

    def update_draft(self, draft: EmailDraft) -> bool:
        """Rewrite an existing draft file; False when the draft is unknown."""
        if not os.path.exists(self._draft_path(draft.id)):
            return False
        self._write(draft)
        return True

    def delete_draft(self, draft_id: str) -> bool:
        """Remove a draft file; best-effort, False on any failure."""
        path = self._draft_path(draft_id)
        if not os.path.exists(path):
            return False
        try:
            os.remove(path)
        except Exception:
            # Removal is best-effort (permissions, races, ...).
            return False
        return True

    def list_drafts(self, status: Optional[str] = None) -> List[EmailDraft]:
        """All drafts, optionally filtered by status, newest first."""
        found: List[EmailDraft] = []
        for filename in os.listdir(self.base_dir):
            if not filename.endswith(".json"):
                continue
            try:
                with open(os.path.join(self.base_dir, filename), "r", encoding="utf-8") as fh:
                    found.append(EmailDraft.from_dict(json.load(fh)))
            except Exception:
                # Skip unreadable/corrupted files instead of failing the listing.
                continue
        if status is not None:
            found = [d for d in found if d.status == status]
        # Most recent first.
        found.sort(key=lambda d: d.created_at or "", reverse=True)
        return found

    def list_pending(self) -> List[EmailDraft]:
        """Drafts still awaiting sending (status == 'draft')."""
        return self.list_drafts(status="draft")

    def mark_sent(self, draft_id: str, success: bool, error_message: Optional[str] = None) -> bool:
        """Record the outcome of a send attempt on an existing draft."""
        draft = self.get_draft(draft_id)
        if draft is None:
            return False
        draft.status = "sent" if success else "failed"
        draft.sent_at = datetime.utcnow().isoformat()
        draft.error_message = None if success else (error_message or "Unknown error")
        return self.update_draft(draft)

    def find_existing_for_task(self, task_id: str) -> Optional[EmailDraft]:
        """
        Deduplication: return the pending ('draft') draft already created for
        this task, if any.  'failed' drafts do not block regeneration.
        """
        return next(
            (d for d in self.list_drafts() if d.task_id == task_id and d.status == "draft"),
            None,
        )
|
||||
87
modules/email/drafts_web.py
Normal file
87
modules/email/drafts_web.py
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
from flask import Blueprint, request, redirect, url_for, flash, Response
|
||||
from typing import List
|
||||
from html import escape
|
||||
|
||||
from modules.email.draft_handler import DraftHandler
|
||||
from modules.email.email_manager import EmailSender
|
||||
|
||||
# Blueprint serving the draft-review UI under /email/drafts.
email_drafts_bp = Blueprint("email_drafts", __name__, url_prefix="/email/drafts")
|
||||
|
||||
|
||||
@email_drafts_bp.get("/")
def list_drafts_page():
    """
    Minimal HTML page listing pending drafts, each with a [Send] button.

    No Jinja template required (inline HTML for easy integration).
    """
    handler = DraftHandler()
    drafts = handler.list_pending()

    def row_html(d):
        # Draft content is injected as raw HTML (assumed already-safe HTML).
        # NOTE(review): if d.content can ever contain user-controlled markup
        # this is an XSS vector -- confirm upstream sanitization.
        return f"""
        <div style="border:1px solid #ddd; padding:12px; margin-bottom:12px; border-radius:8px;">
            <div style="display:flex; justify-content:space-between; align-items:center;">
                <h3 style="margin:0; font-size:1.05rem;">{escape(d.subject)}</h3>
                <form method="post" action="{url_for('email_drafts.send_draft')}">
                    <input type="hidden" name="draft_id" value="{escape(d.id)}" />
                    <button type="submit" style="padding:6px 12px;">Envoyer</button>
                </form>
            </div>
            <div style="color:#555; margin:6px 0 8px 0;">À: {escape(d.to_email)}</div>
            <div style="background:#fafafa; padding:10px; border-radius:6px;">{d.content}</div>
            <div style="font-size:12px; color:#777; margin-top:6px;">
                Prospect: {escape(d.prospect_id)} | Template: {escape(d.template_id or '-') }
            </div>
        </div>
        """

    items_html = "\n".join(row_html(d) for d in drafts) or "<p>Aucun brouillon à envoyer.</p>"

    page = f"""
    <!doctype html>
    <html lang="fr">
    <head>
        <meta charset="utf-8">
        <title>Brouillons d'emails</title>
        <meta name="viewport" content="width=device-width, initial-scale=1" />
    </head>
    <body style="max-width:920px; margin: 20px auto; font-family: system-ui, -apple-system, Segoe UI, Roboto, sans-serif;">
        <h2>Brouillons d'emails à envoyer</h2>
        <div>{items_html}</div>
    </body>
    </html>
    """
    return Response(page, mimetype="text/html")
|
||||
|
||||
|
||||
@email_drafts_bp.post("/send")
def send_draft():
    """
    Send the selected draft, then persist the resulting status.
    """
    draft_id = (request.form.get("draft_id") or "").strip()
    back = redirect(url_for("email_drafts.list_drafts_page"))

    # Guard: a draft id must be supplied by the form.
    if not draft_id:
        flash("Brouillon invalide", "warning")
        return back

    handler = DraftHandler()
    draft = handler.get_draft(draft_id)
    if draft is None:
        flash("Brouillon introuvable", "danger")
        return back

    sender = EmailSender()
    try:
        outcome = sender.send_email(draft.to_email, draft.subject, draft.content)
        if outcome.get("success"):
            handler.mark_sent(draft.id, success=True)
            flash("Email envoyé.", "success")
        else:
            handler.mark_sent(draft.id, success=False, error_message=outcome.get("error"))
            flash("Échec de l'envoi de l'email.", "danger")
    except Exception as exc:
        # Unexpected failure (e.g. missing SMTP configuration).
        handler.mark_sent(draft.id, success=False, error_message=str(exc))
        flash("Erreur lors de l'envoi de l'email.", "danger")

    return back
|
||||
353
modules/email/email_manager.py
Normal file
353
modules/email/email_manager.py
Normal file
|
|
@ -0,0 +1,353 @@
|
|||
from typing import List, Dict, Any, Union
|
||||
import smtplib
|
||||
from email.mime.text import MIMEText
|
||||
from email.mime.multipart import MIMEMultipart
|
||||
from datetime import datetime
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
from urllib.parse import quote_plus
|
||||
from uuid import uuid4
|
||||
from core.data import Data
|
||||
|
||||
class EmailTemplate:
    """Manages email templates stored as JSON files (one file per template)."""

    def __init__(self, template_folder="Data/email_templates"):
        # Folder holding one <template_id>.json file per template.
        self.template_folder = template_folder

        # Create the template folder if it does not exist yet.
        if not os.path.exists(self.template_folder):
            os.makedirs(self.template_folder)

    def get_all_templates(self):
        """Return every available template (unreadable files are skipped)."""
        templates = []
        if os.path.exists(self.template_folder):
            for filename in os.listdir(self.template_folder):
                if filename.endswith('.json'):
                    template_path = os.path.join(self.template_folder, filename)
                    try:
                        data_manager = Data(template_path)
                        template_data = data_manager.load_data()
                        templates.append(template_data)
                    except Exception as e:
                        # Fix: report WHICH file failed (message previously
                        # hard-coded "(unknown)" although filename is in scope).
                        print(f"Erreur lors du chargement du template {filename}: {e}")
        return templates

    def get_template_by_id(self, template_id):
        """Fetch one template by id, or None when no such file exists."""
        template_path = os.path.join(self.template_folder, f"{template_id}.json")
        if os.path.exists(template_path):
            data_manager = Data(template_path)
            return data_manager.load_data()
        return None

    def save_template(self, template_data):
        """Save a template, generating an id when absent; returns the template."""
        template_id = template_data.get('id')
        if not template_id:
            # Generate an id if the template has none yet.
            import uuid
            template_id = f"tpl_{uuid.uuid4().hex[:8]}"
            template_data['id'] = template_id

        template_path = os.path.join(self.template_folder, f"{template_id}.json")
        data_manager = Data(template_path)
        data_manager.save_data(template_data)
        return template_data

    def delete_template(self, template_id):
        """Delete a template file; True when something was actually removed."""
        template_path = os.path.join(self.template_folder, f"{template_id}.json")
        if os.path.exists(template_path):
            os.remove(template_path)
            return True
        return False

    def render_template(self, template_id, context=None):
        """
        Render a template, substituting {{key}} placeholders from context.

        Returns {"subject": ..., "content": ...}, or None when the template
        does not exist.
        """
        template = self.get_template_by_id(template_id)
        if not template:
            return None

        subject = template.get('subject', '')
        content = template.get('content', '')

        # Replace {{key}} placeholders in both the subject and the content.
        if context:
            for key, value in context.items():
                placeholder = f"{{{{{key}}}}}"
                subject = subject.replace(placeholder, str(value))
                content = content.replace(placeholder, str(value))

        return {
            "subject": subject,
            "content": content
        }
|
||||
|
||||
|
||||
class EmailSender:
    """Sends emails over SMTP, with optional templating and open/click tracking."""

    def __init__(self, config_file="config/email_config.json"):
        # Ensure config_file is an absolute path (resolved against project root).
        if not os.path.isabs(config_file):
            base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            self.config_file = os.path.join(base_dir, config_file)
        else:
            self.config_file = config_file
        self.config = self._load_config()
        self.template_manager = EmailTemplate()

    @staticmethod
    def _redacted(config):
        """Copy of the config that is safe to log (password masked)."""
        safe = dict(config)
        if safe.get("password"):
            safe["password"] = "***"
        return safe

    def _load_config(self):
        """Load the email configuration file, falling back to defaults."""
        config_dir = os.path.dirname(self.config_file)
        if not os.path.exists(config_dir):
            os.makedirs(config_dir)

        print(f"Loading email config from: {self.config_file}")
        if os.path.exists(self.config_file):
            try:
                with open(self.config_file, 'r') as f:
                    config = json.load(f)
                # Security fix: never print the SMTP password in clear text.
                print(f"Loaded email config: {self._redacted(config)}")
                return config
            except Exception as e:
                print(f"Erreur lors du chargement de la configuration email: {e}")

        # Default configuration
        default_config = {
            "smtp_server": "smtp.gmail.com",
            "smtp_port": 587,
            "username": "",
            "password": "",
            "sender_name": "Suite Consultance",
            "sender_email": ""
        }
        print(f"Using default email config: {self._redacted(default_config)}")
        return default_config

    def save_config(self, config):
        """Persist and activate a new email configuration."""
        config_dir = os.path.dirname(self.config_file)
        if not os.path.exists(config_dir):
            os.makedirs(config_dir)

        with open(self.config_file, 'w') as f:
            json.dump(config, f, indent=4)

        self.config = config
        return True

    def send_email(self, to_email, subject, body, cc=None, bcc=None):
        """
        Send one HTML email to a recipient.

        Returns {"success": True, ...} or {"success": False, "error": ...}.
        Raises ValueError when credentials are missing from the configuration.
        """
        if not self.config.get('username') or not self.config.get('password'):
            raise ValueError("La configuration email n'est pas complète")

        message = MIMEMultipart()
        message["From"] = f"{self.config.get('sender_name')} <{self.config.get('sender_email')}>"
        message["To"] = to_email
        message["Subject"] = subject

        if cc:
            message["Cc"] = ", ".join(cc) if isinstance(cc, list) else cc
        if bcc:
            message["Bcc"] = ", ".join(bcc) if isinstance(bcc, list) else bcc

        message.attach(MIMEText(body, "html"))

        try:
            # Fix: the previous code called server.quit() only on the success
            # path, leaking the connection when login/send raised. The context
            # manager closes the SMTP session in every case.
            with smtplib.SMTP(self.config.get('smtp_server'), self.config.get('smtp_port')) as server:
                server.starttls()
                server.login(self.config.get('username'), self.config.get('password'))

                recipients = [to_email]
                if cc:
                    recipients.extend(cc if isinstance(cc, list) else [cc])
                if bcc:
                    recipients.extend(bcc if isinstance(bcc, list) else [bcc])

                server.sendmail(self.config.get('sender_email'), recipients, message.as_string())

            return {
                "success": True,
                "timestamp": datetime.now().isoformat(),
                "to": to_email,
                "subject": subject
            }
        except Exception as e:
            return {
                "success": False,
                "error": str(e),
                "timestamp": datetime.now().isoformat()
            }

    def send_templated_email(self, to_email, template_id, context=None, cc=None, bcc=None):
        """Render a stored template and send it to one recipient."""
        rendered = self.template_manager.render_template(template_id, context)
        if not rendered:
            return {
                "success": False,
                "error": "Template not found",
                "timestamp": datetime.now().isoformat()
            }

        return self.send_email(to_email, rendered['subject'], rendered['content'], cc, bcc)

    def send_bulk_email(self, emails, subject, body, cc=None, bcc=None):
        """Send the same email to several recipients; one result dict per address."""
        results = []
        for email in emails:
            result = self.send_email(email, subject, body, cc, bcc)
            results.append({
                "email": email,
                **result
            })
        return results

    def send_bulk_templated_email(self, recipients, template_id, cc=None, bcc=None):
        """
        Send a templated email to several recipients.

        recipients: list of dicts with the recipient address and its context:
        [{
            "email": "example@example.com",
            "context": {"name": "John Doe", "company": "ACME Inc."}
        }]
        """
        results = []
        for recipient in recipients:
            email = recipient.get('email')
            context = recipient.get('context', {})
            result = self.send_templated_email(email, template_id, context, cc, bcc)
            results.append({
                "email": email,
                **result
            })
        return results

    # ---------- Tracking helpers ----------
    def _embed_tracking(self, html_body: str, tracking_id: str, prospect_id: str) -> str:
        """
        Add an open-tracking pixel and rewrite links for click tracking.

        Uses APP_BASE_URL when set, otherwise emits relative links.
        """
        base = (os.environ.get("APP_BASE_URL") or "").rstrip("/")
        prefix = f"{base}/tasks/t"  # tracking routes are mounted on the 'tasks' blueprint
        # Open pixel (1x1 PNG).
        pixel = f'<img src="{prefix}/o/{tracking_id}.png?pid={quote_plus(prospect_id)}" alt="" width="1" height="1" style="display:none;" />'
        body = html_body or ""
        # Inject the pixel just before </body> when the tag is present.
        if "</body>" in body.lower():
            # Locate the actual tag while preserving its original case.
            idx = body.lower().rfind("</body>")
            body = body[:idx] + pixel + body[idx:]
        else:
            body = body + pixel

        # Rewrite <a href="..."> links through the click-tracking redirect.
        def _rewrite(match):
            url = match.group(1)
            # Skip links that are already tracked.
            if "/tasks/t/c/" in url:
                return f'href="{url}"'
            tracked = f'{prefix}/c/{tracking_id}?u={quote_plus(url)}'
            return f'href="{tracked}"'

        body = re.sub(r'href="([^"]+)"', _rewrite, body)
        return body

    def send_tracked_email(self, to_email: str, subject: str, body: str, prospect_id: str, template_id: str = None, cc=None, bcc=None) -> Dict[str, Any]:
        """
        Send an email with open/click tracking.

        Creates a tracking record, then injects a pixel and rewrites links.
        """
        tracking_id = f"trk_{uuid4().hex[:16]}"
        # Create the tracking record (best-effort).
        try:
            from modules.tracking.store import TrackingStore
            store = TrackingStore()
            store.create_record(tracking_id, {
                "prospect_id": prospect_id,
                "to": to_email,
                "subject": subject,
                "template_id": template_id,
                "opens": 0,
                "clicks": 0,
            })
        except Exception:
            # Even if the tracking store fails, still try to send the email.
            pass

        tracked_body = self._embed_tracking(body, tracking_id, prospect_id)
        result = self.send_email(to_email, subject, tracked_body, cc, bcc)
        result["tracking_id"] = tracking_id
        return result
|
||||
|
||||
|
||||
class EmailHistory:
    """Per-prospect history of sent emails, stored as JSON files."""

    def __init__(self, history_folder="Data/email_history"):
        # One <prospect_id>.json file per prospect.
        self.history_folder = history_folder

        # Create the history folder if it does not exist yet.
        if not os.path.exists(self.history_folder):
            os.makedirs(self.history_folder)

    def add_email_record(self, prospect_id, email_data):
        """Append one email record (timestamped now) to a prospect's history."""
        history_file = os.path.join(self.history_folder, f"{prospect_id}.json")

        # Load the existing history; a missing or corrupt file starts fresh.
        history = []
        if os.path.exists(history_file):
            try:
                with open(history_file, 'r') as f:
                    history = json.load(f)
            except (OSError, ValueError):
                # Fix: narrowed from a bare `except:`; json.JSONDecodeError is
                # a ValueError subclass, OSError covers read failures.
                history = []

        # Append the new record.
        history.append({
            **email_data,
            "timestamp": datetime.now().isoformat()
        })

        # Persist the updated history.
        with open(history_file, 'w') as f:
            json.dump(history, f, indent=4)

        return True

    def get_prospect_email_history(self, prospect_id):
        """Return the email history list for one prospect ([] when none)."""
        history_file = os.path.join(self.history_folder, f"{prospect_id}.json")

        if os.path.exists(history_file):
            try:
                with open(history_file, 'r') as f:
                    return json.load(f)
            except (OSError, ValueError):
                # Fix: narrowed from a bare `except:`.
                return []

        return []

    def get_all_email_history(self):
        """Return {prospect_id: history_list} for every stored prospect."""
        all_history = {}

        if os.path.exists(self.history_folder):
            for filename in os.listdir(self.history_folder):
                if filename.endswith('.json'):
                    prospect_id = filename.split('.')[0]
                    history_file = os.path.join(self.history_folder, filename)

                    try:
                        with open(history_file, 'r') as f:
                            all_history[prospect_id] = json.load(f)
                    except (OSError, ValueError):
                        # Fix: narrowed from a bare `except:`.
                        all_history[prospect_id] = []

        return all_history
|
||||
968
modules/email/email_scraper.py
Normal file
968
modules/email/email_scraper.py
Normal file
|
|
@ -0,0 +1,968 @@
|
|||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import time
|
||||
from typing import List, Set, Dict
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
class EmailScraper:
|
||||
    def __init__(self):
        """Initialize the HTTP session, extraction regexes and crawl state."""
        # Reused session: keep-alive connections and shared headers.
        self.session = requests.Session()
        # Desktop-browser User-Agent to reduce trivial bot blocking.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        # Generic email address pattern.
        self.email_pattern = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}')
        # Belgian (+32/0) and French (+33/0) phone number formats.
        self.phone_pattern = re.compile(r'(?:\+32|0)\s?[1-9](?:[\s\-\.\/]?\d){8}|\+32\s?[1-9](?:[\s\-\.\/]?\d){8}|(?:\+33|0)[1-9](?:[\s\-\.\/]?\d){8}')
        self.visited_urls = set()    # URLs already fetched (loop protection)
        self.found_emails = set()    # NOTE(review): not used in the code visible here -- confirm
        self.contact_info = {}       # NOTE(review): not used in the code visible here -- confirm
|
||||
|
||||
    def scrape_page(self, url: str, max_pages: int = 10) -> Dict:
        """
        Scrape a page (following pagination) and extract business contact data.

        Returns a result dict with contacts, per-page stats, errors and timing.
        """
        results = {
            'url': url,
            'contacts': [],  # contact records: email, name, phone, etc.
            'pages_scraped': [],
            'errors': [],
            'start_time': datetime.now().isoformat(),
            'end_time': None,
            'domain_info': {}
        }

        try:
            self._scrape_with_pagination(url, results, max_pages)
            # NOTE(review): _extract_domain_info is defined outside this view.
            self._extract_domain_info(url, results)
        except Exception as e:
            results['errors'].append(f"Erreur générale: {str(e)}")

        results['end_time'] = datetime.now().isoformat()

        return results
|
||||
|
||||
    def _scrape_with_pagination(self, base_url: str, results: Dict, max_pages: int):
        """
        Crawl up to max_pages starting at base_url, following "next page" links.

        Mutates `results` in place (contacts, pages_scraped, errors).
        """
        current_page = 1
        current_url = base_url

        while current_page <= max_pages:
            # Stop if we looped back to a page already fetched.
            if current_url in self.visited_urls:
                break

            try:
                # Normalize the URL (default to https when no scheme is given).
                parsed_url = urlparse(current_url)
                if not parsed_url.scheme:
                    current_url = 'https://' + current_url

                self.visited_urls.add(current_url)

                print(f"Scraping page {current_page}: {current_url}")

                # Fetch the page.
                response = self.session.get(current_url, timeout=15)
                response.raise_for_status()

                # Parse the HTML.
                soup = BeautifulSoup(response.content, 'html.parser')

                # Extract the businesses/contacts found on this page.
                page_contacts = self._extract_business_contacts(soup, response.text, current_url)

                # Merge the page's contacts into the global list.
                for contact in page_contacts:
                    # Deduplicate on email address.
                    existing_contact = next((c for c in results['contacts'] if c['email'] == contact['email']), None)
                    if existing_contact:
                        # Merge details when the contact already exists.
                        # NOTE(review): _merge_contact_info is defined outside this view.
                        self._merge_contact_info(existing_contact, contact)
                    else:
                        results['contacts'].append(contact)

                results['pages_scraped'].append({
                    'url': current_url,
                    'page_number': current_page,
                    'contacts_found': len(page_contacts),
                    'contacts': page_contacts,
                    'status': 'success',
                    'timestamp': datetime.now().isoformat()
                })

                print(f" - Page {current_page}: Trouvé {len(page_contacts)} contact(s)")

                # No contacts found: assume we reached the end of the listing.
                if len(page_contacts) == 0:
                    print(f" - Aucun contact trouvé sur la page {current_page}, arrêt du scraping")
                    break

                # Look for the link to the next page.
                # NOTE(review): _find_next_page_url is defined outside this view.
                next_url = self._find_next_page_url(soup, current_url, current_page)

                if not next_url:
                    print(f" - Pas de page suivante trouvée, arrêt du scraping")
                    break

                current_url = next_url
                current_page += 1

                # Delay between pages to avoid hammering the server.
                time.sleep(2)

            except requests.exceptions.RequestException as e:
                # Network/HTTP failure: record it and stop crawling.
                results['errors'].append(f"Erreur de requête pour la page {current_page} ({current_url}): {str(e)}")
                results['pages_scraped'].append({
                    'url': current_url,
                    'page_number': current_page,
                    'contacts_found': 0,
                    'contacts': [],
                    'status': 'error',
                    'error': str(e),
                    'timestamp': datetime.now().isoformat()
                })
                break
            except Exception as e:
                # Parsing failure: record it and stop crawling.
                results['errors'].append(f"Erreur lors du parsing de la page {current_page}: {str(e)}")
                break
|
||||
|
||||
def _extract_business_contacts(self, soup: BeautifulSoup, text: str, page_url: str) -> List[Dict]:
|
||||
"""
|
||||
Extraire les informations d'entreprises d'une page (spécialisé pour les annuaires)
|
||||
"""
|
||||
contacts = []
|
||||
|
||||
# Chercher des conteneurs d'entreprises communs
|
||||
business_containers = self._find_business_containers(soup)
|
||||
|
||||
if business_containers:
|
||||
# Si on trouve des conteneurs structurés, les traiter
|
||||
for container in business_containers:
|
||||
contact = self._extract_contact_from_container(container, page_url)
|
||||
if contact and contact.get('email'):
|
||||
contacts.append(contact)
|
||||
else:
|
||||
# Fallback: extraction générale comme avant
|
||||
contacts = self._extract_contact_info(soup, text, page_url)
|
||||
|
||||
return contacts
|
||||
|
||||
def _find_business_containers(self, soup: BeautifulSoup) -> List:
|
||||
"""
|
||||
Trouver les conteneurs qui contiennent probablement des informations d'entreprises
|
||||
"""
|
||||
containers = []
|
||||
|
||||
# Patterns communs pour les annuaires d'entreprises
|
||||
business_selectors = [
|
||||
# Classes/IDs communs
|
||||
'[class*="business"]',
|
||||
'[class*="company"]',
|
||||
'[class*="enterprise"]',
|
||||
'[class*="contact"]',
|
||||
'[class*="listing"]',
|
||||
'[class*="directory"]',
|
||||
'[class*="card"]',
|
||||
'[class*="item"]',
|
||||
'[class*="entry"]',
|
||||
'[class*="result"]',
|
||||
# Balises sémantiques
|
||||
'article',
|
||||
'[itemtype*="Organization"]',
|
||||
'[itemtype*="LocalBusiness"]',
|
||||
# Structures de liste
|
||||
'li[class*="business"]',
|
||||
'li[class*="company"]',
|
||||
'div[class*="row"]',
|
||||
'div[class*="col"]'
|
||||
]
|
||||
|
||||
for selector in business_selectors:
|
||||
try:
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
# Vérifier si l'élément contient des informations utiles
|
||||
if self._container_has_business_info(element):
|
||||
containers.append(element)
|
||||
except:
|
||||
continue
|
||||
|
||||
# Déduplication basée sur le contenu
|
||||
unique_containers = []
|
||||
for container in containers:
|
||||
if not any(self._containers_are_similar(container, existing) for existing in unique_containers):
|
||||
unique_containers.append(container)
|
||||
|
||||
return unique_containers[:50] # Limiter pour éviter la surcharge
|
||||
|
||||
def _container_has_business_info(self, container) -> bool:
|
||||
"""
|
||||
Vérifier si un conteneur a des informations d'entreprise
|
||||
"""
|
||||
text = container.get_text(strip=True).lower()
|
||||
|
||||
# Indicateurs d'informations d'entreprise
|
||||
business_indicators = [
|
||||
'@', 'email', 'mail', 'contact',
|
||||
'tel', 'phone', 'telephone', 'gsm',
|
||||
'rue', 'avenue', 'boulevard', 'place',
|
||||
'www.', 'http', '.com', '.be', '.fr',
|
||||
'sarl', 'sprl', 'sa', 'nv', 'bvba'
|
||||
]
|
||||
|
||||
score = sum(1 for indicator in business_indicators if indicator in text)
|
||||
return score >= 2 and len(text) > 20
|
||||
|
||||
def _containers_are_similar(self, container1, container2) -> bool:
|
||||
"""
|
||||
Vérifier si deux conteneurs sont similaires (pour éviter les doublons)
|
||||
"""
|
||||
text1 = container1.get_text(strip=True)
|
||||
text2 = container2.get_text(strip=True)
|
||||
|
||||
# Si les textes sont identiques ou très similaires
|
||||
if text1 == text2:
|
||||
return True
|
||||
|
||||
# Si un conteneur est inclus dans l'autre
|
||||
if len(text1) > len(text2):
|
||||
return text2 in text1
|
||||
else:
|
||||
return text1 in text2
|
||||
|
||||
def _extract_contact_from_container(self, container, page_url: str) -> Dict:
    """Extract contact information from a single business container.

    Email discovery runs in three passes of decreasing reliability:
    mailto links, per-tag text, then context-anchored regexes on the whole
    container text. Phone, company, person names and location are delegated
    to the dedicated helpers. Returns the contact dict, or None when
    neither an email nor a company name was found (despite the Dict
    annotation).
    """
    contact = {
        'email': '',
        'name': '',
        'first_name': '',
        'last_name': '',
        'company': '',
        'phone': '',
        'location': '',
        'source_url': page_url,
        'notes': ''
    }

    # Pass 1: mailto links are the most reliable email source
    email_found = False

    mailto_links = container.find_all('a', href=re.compile(r'^mailto:', re.I))
    if mailto_links:
        href = mailto_links[0].get('href', '')
        # strip any ?subject=...&... query from the mailto target
        email_match = re.search(r'mailto:([^?&]+)', href, re.I)
        if email_match and self._is_valid_email(email_match.group(1)):
            contact['email'] = email_match.group(1).lower()
            email_found = True

    # Pass 2: individual text-bearing tags
    if not email_found:
        for element in container.find_all(['p', 'div', 'span', 'td', 'li']):
            element_text = element.get_text(strip=True)
            # pad with spaces so boundary-sensitive matching survives tag concatenation
            element_text = ' ' + element_text + ' '

            email_matches = self.email_pattern.findall(element_text)
            if email_matches:
                for email in email_matches:
                    email = email.strip()
                    # re-validate the full address shape to reject partial captures
                    if re.match(r'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$', email) and self._is_valid_email(email):
                        contact['email'] = email.lower()
                        email_found = True
                        break
            if email_found:
                break

    # Pass 3: whole-container text with context-anchored patterns
    if not email_found:
        container_text = container.get_text(separator=' ', strip=True)  # separator avoids gluing adjacent words together

        # patterns with surrounding context to avoid stray captures
        context_patterns = [
            r'(?:email|e-mail|mail|contact)\s*:?\s*([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})',
            r'([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})(?=\s|$|[^\w.-])',
        ]

        for pattern in context_patterns:
            matches = re.findall(pattern, container_text, re.IGNORECASE)
            if matches:
                # findall may yield strings or tuples depending on the group layout
                email = matches[0] if isinstance(matches[0], str) else matches[0][0] if matches[0] else ''
                if email and self._is_valid_email(email):
                    contact['email'] = email.lower()
                    email_found = True
                    break

    # Phone: first candidate, truncated at the first character that cannot
    # belong to a number (digits, spaces, -, ., /, + are kept)
    container_text = container.get_text(separator=' ', strip=True)
    phone_matches = self.phone_pattern.findall(container_text)
    if phone_matches:
        phone = phone_matches[0]
        clean_phone = re.sub(r'[^0-9\s\-\.\/\+].*$', '', phone)
        contact['phone'] = clean_phone.strip()

    # Company name
    contact['company'] = self._extract_company_name(container, container_text)

    # Person names
    names = self._extract_person_names(container, container_text)
    if names:
        contact.update(names)

    # Location
    contact['location'] = self._extract_location_from_container(container, container_text)

    # Contextual enrichment (guesses + notes)
    self._enhance_business_contact(contact, container, container_text)

    return contact if contact['email'] or contact['company'] else None
|
||||
|
||||
def _extract_company_name(self, container, text: str) -> str:
|
||||
"""
|
||||
Extraire le nom de l'entreprise d'un conteneur
|
||||
"""
|
||||
# Chercher dans les balises title, h1-h6, strong, b
|
||||
title_elements = container.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong', 'b', '[class*="title"]', '[class*="name"]', '[class*="company"]'])
|
||||
|
||||
for element in title_elements:
|
||||
company_text = element.get_text(strip=True)
|
||||
if len(company_text) > 2 and len(company_text) < 100:
|
||||
# Éviter les textes trop génériques
|
||||
if not any(generic in company_text.lower() for generic in ['accueil', 'contact', 'email', 'téléphone', 'adresse']):
|
||||
return company_text
|
||||
|
||||
# Fallback: prendre la première ligne non-vide qui semble être un nom
|
||||
lines = text.split('\n')
|
||||
for line in lines[:3]: # Les 3 premières lignes
|
||||
line = line.strip()
|
||||
if len(line) > 2 and len(line) < 100 and not '@' in line and not any(char.isdigit() for char in line[:3]):
|
||||
return line
|
||||
|
||||
return ''
|
||||
|
||||
def _extract_person_names(self, container, text: str) -> Dict:
|
||||
"""
|
||||
Extraire les noms de personnes
|
||||
"""
|
||||
names = {'name': '', 'first_name': '', 'last_name': ''}
|
||||
|
||||
# Patterns pour les noms de personnes
|
||||
name_patterns = [
|
||||
r'\b([A-Z][a-zÀ-ÿ]+)\s+([A-Z][a-zÀ-ÿ]+)\b', # Prénom Nom
|
||||
r'\b([A-Z][A-Z]+)\s+([A-Z][a-zÀ-ÿ]+)\b', # NOM Prénom
|
||||
]
|
||||
|
||||
# Chercher dans les balises spécifiques
|
||||
name_elements = container.find_all(['[class*="name"]', '[class*="contact"]', '[class*="person"]'])
|
||||
|
||||
for element in name_elements:
|
||||
element_text = element.get_text(strip=True)
|
||||
for pattern in name_patterns:
|
||||
match = re.search(pattern, element_text)
|
||||
if match:
|
||||
names['first_name'] = match.group(1)
|
||||
names['last_name'] = match.group(2)
|
||||
names['name'] = f"{names['first_name']} {names['last_name']}"
|
||||
return names
|
||||
|
||||
# Si pas trouvé dans les balises, chercher dans le texte
|
||||
for pattern in name_patterns:
|
||||
match = re.search(pattern, text)
|
||||
if match:
|
||||
names['first_name'] = match.group(1)
|
||||
names['last_name'] = match.group(2)
|
||||
names['name'] = f"{names['first_name']} {names['last_name']}"
|
||||
break
|
||||
|
||||
return names
|
||||
|
||||
def _extract_location_from_container(self, container, text: str) -> str:
|
||||
"""
|
||||
Extraire la localisation d'un conteneur
|
||||
"""
|
||||
# Chercher dans les balises d'adresse
|
||||
address_elements = container.find_all(['address', '[class*="address"]', '[class*="location"]', '[class*="ville"]', '[class*="city"]'])
|
||||
|
||||
for element in address_elements:
|
||||
location_text = element.get_text(strip=True)
|
||||
if len(location_text) > 5:
|
||||
return location_text
|
||||
|
||||
# Patterns pour les adresses belges/françaises
|
||||
location_patterns = [
|
||||
r'\b\d{4,5}\s+[A-Za-zÀ-ÿ\s\-]+\b', # Code postal + ville
|
||||
r'\b[A-Za-zÀ-ÿ\s\-]+,\s*[A-Za-zÀ-ÿ\s\-]+\b', # Ville, Région/Pays
|
||||
r'\b(?:rue|avenue|boulevard|place|chemin)\s+[A-Za-zÀ-ÿ\s\d\-,]+\b' # Adresse complète
|
||||
]
|
||||
|
||||
for pattern in location_patterns:
|
||||
match = re.search(pattern, text, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(0).strip()
|
||||
|
||||
return ''
|
||||
|
||||
def _enhance_business_contact(self, contact: Dict, container, text: str):
|
||||
"""
|
||||
Améliorer les informations de contact d'entreprise
|
||||
"""
|
||||
# Si pas de nom trouvé, essayer d'extraire depuis l'email
|
||||
if not contact['name'] and contact['email']:
|
||||
local_part = contact['email'].split('@')[0]
|
||||
domain_part = contact['email'].split('@')[1]
|
||||
|
||||
if '.' in local_part:
|
||||
parts = local_part.split('.')
|
||||
contact['first_name'] = parts[0].title()
|
||||
contact['last_name'] = parts[1].title() if len(parts) > 1 else ''
|
||||
contact['name'] = f"{contact['first_name']} {contact['last_name']}".strip()
|
||||
|
||||
# Si pas d'entreprise, essayer de deviner depuis le domaine
|
||||
if not contact['company']:
|
||||
company_name = domain_part.split('.')[0]
|
||||
contact['company'] = company_name.title()
|
||||
|
||||
# Enrichir les notes avec des informations contextuelles
|
||||
notes_parts = []
|
||||
|
||||
# Chercher des informations sur l'activité
|
||||
activity_patterns = [
|
||||
r'(?i)\b(restaurant|café|boulangerie|pharmacie|garage|coiffeur|médecin|avocat|comptable|architecte|dentiste|vétérinaire|magasin|boutique|salon)\b',
|
||||
r'(?i)\b(commerce|service|entreprise|société|bureau|cabinet|clinique|centre|institut)\b'
|
||||
]
|
||||
|
||||
for pattern in activity_patterns:
|
||||
matches = re.findall(pattern, text)
|
||||
if matches:
|
||||
notes_parts.append(f"Activité: {', '.join(set(matches))}")
|
||||
break
|
||||
|
||||
# Chercher des horaires
|
||||
horaires_pattern = r'(?i)(?:ouvert|fermé|horaires?)[:\s]*([^.!?\n]{10,50})'
|
||||
horaires_match = re.search(horaires_pattern, text)
|
||||
if horaires_match:
|
||||
notes_parts.append(f"Horaires: {horaires_match.group(1).strip()}")
|
||||
|
||||
# Chercher un site web
|
||||
website_pattern = r'\b(?:www\.)?[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9]*\.(?:com|be|fr|org|net)\b'
|
||||
website_match = re.search(website_pattern, text)
|
||||
if website_match:
|
||||
notes_parts.append(f"Site web: {website_match.group(0)}")
|
||||
|
||||
contact['notes'] = ' | '.join(notes_parts)
|
||||
|
||||
def _find_next_page_url(self, soup: BeautifulSoup, current_url: str, current_page: int) -> str:
|
||||
"""
|
||||
Trouver l'URL de la page suivante
|
||||
"""
|
||||
base_url = '/'.join(current_url.split('/')[:-1]) if '/' in current_url else current_url
|
||||
|
||||
# Patterns communs pour les liens de pagination
|
||||
next_patterns = [
|
||||
# Liens avec texte
|
||||
'a[href]:contains("Suivant")',
|
||||
'a[href]:contains("Next")',
|
||||
'a[href]:contains(">")',
|
||||
'a[href]:contains("Page suivante")',
|
||||
# Liens avec classes
|
||||
'a[class*="next"]',
|
||||
'a[class*="suivant"]',
|
||||
'a[class*="pagination"]',
|
||||
# Numéros de page
|
||||
f'a[href]:contains("{current_page + 1}")',
|
||||
]
|
||||
|
||||
for pattern in next_patterns:
|
||||
try:
|
||||
links = soup.select(pattern)
|
||||
for link in links:
|
||||
href = link.get('href')
|
||||
if href:
|
||||
# Construire l'URL complète
|
||||
if href.startswith('http'):
|
||||
return href
|
||||
elif href.startswith('/'):
|
||||
parsed = urlparse(current_url)
|
||||
return f"{parsed.scheme}://{parsed.netloc}{href}"
|
||||
else:
|
||||
return urljoin(current_url, href)
|
||||
except:
|
||||
continue
|
||||
|
||||
# Essayer de construire l'URL de la page suivante par pattern
|
||||
# Pattern 1: ?page=X
|
||||
if 'page=' in current_url:
|
||||
return re.sub(r'page=\d+', f'page={current_page + 1}', current_url)
|
||||
|
||||
# Pattern 2: /pageX
|
||||
if f'/page{current_page}' in current_url:
|
||||
return current_url.replace(f'/page{current_page}', f'/page{current_page + 1}')
|
||||
|
||||
# Pattern 3: Ajouter ?page=2 si c'est la première page
|
||||
if current_page == 1:
|
||||
separator = '&' if '?' in current_url else '?'
|
||||
return f"{current_url}{separator}page={current_page + 1}"
|
||||
|
||||
return None
|
||||
|
||||
def _extract_contact_info(self, soup: BeautifulSoup, text: str, page_url: str) -> List[Dict]:
|
||||
"""
|
||||
Extraire les informations de contact complètes d'une page
|
||||
"""
|
||||
contacts = []
|
||||
|
||||
# Extraire tous les emails
|
||||
emails = set()
|
||||
emails.update(self._extract_emails_from_text(text))
|
||||
emails.update(self._extract_emails_from_links(soup))
|
||||
|
||||
# Extraire les numéros de téléphone
|
||||
phones = self._extract_phone_numbers(text)
|
||||
|
||||
# Extraire les noms et entreprises depuis les balises structurées
|
||||
structured_contacts = self._extract_structured_contacts(soup)
|
||||
|
||||
# Extraire l'adresse/localité
|
||||
location = self._extract_location_info(soup, text)
|
||||
|
||||
# Créer des contacts pour chaque email trouvé
|
||||
for email in emails:
|
||||
if not self._is_valid_email(email):
|
||||
continue
|
||||
|
||||
contact = {
|
||||
'email': email.lower(),
|
||||
'name': '',
|
||||
'first_name': '',
|
||||
'last_name': '',
|
||||
'company': '',
|
||||
'phone': '',
|
||||
'location': location,
|
||||
'source_url': page_url,
|
||||
'notes': ''
|
||||
}
|
||||
|
||||
# Essayer de trouver des informations complémentaires
|
||||
self._enhance_contact_info(contact, soup, text, structured_contacts, phones)
|
||||
|
||||
contacts.append(contact)
|
||||
|
||||
return contacts
|
||||
|
||||
def _extract_phone_numbers(self, text: str) -> List[str]:
|
||||
"""
|
||||
Extraire les numéros de téléphone
|
||||
"""
|
||||
phones = []
|
||||
matches = self.phone_pattern.findall(text)
|
||||
|
||||
for phone in matches:
|
||||
# Nettoyer le numéro
|
||||
clean_phone = re.sub(r'[\s\-\.\/]', '', phone)
|
||||
if len(clean_phone) >= 9: # Numéro valide
|
||||
phones.append(phone)
|
||||
|
||||
return phones
|
||||
|
||||
def _extract_structured_contacts(self, soup: BeautifulSoup) -> List[Dict]:
|
||||
"""
|
||||
Extraire les contacts depuis les données structurées (microdata, JSON-LD, etc.)
|
||||
"""
|
||||
contacts = []
|
||||
|
||||
# Chercher les données JSON-LD
|
||||
json_scripts = soup.find_all('script', type='application/ld+json')
|
||||
for script in json_scripts:
|
||||
try:
|
||||
data = json.loads(script.string)
|
||||
if isinstance(data, dict):
|
||||
contact = self._parse_json_ld_contact(data)
|
||||
if contact:
|
||||
contacts.append(contact)
|
||||
elif isinstance(data, list):
|
||||
for item in data:
|
||||
contact = self._parse_json_ld_contact(item)
|
||||
if contact:
|
||||
contacts.append(contact)
|
||||
except:
|
||||
continue
|
||||
|
||||
# Chercher les microdata
|
||||
contacts.extend(self._extract_microdata_contacts(soup))
|
||||
|
||||
return contacts
|
||||
|
||||
def _parse_json_ld_contact(self, data: Dict) -> Dict:
|
||||
"""
|
||||
Parser un contact depuis les données JSON-LD
|
||||
"""
|
||||
contact = {}
|
||||
|
||||
if data.get('@type') in ['Organization', 'LocalBusiness', 'Person']:
|
||||
contact['name'] = data.get('name', '')
|
||||
contact['company'] = data.get('name', '') if data.get('@type') != 'Person' else ''
|
||||
|
||||
# Email
|
||||
email = data.get('email')
|
||||
if email:
|
||||
contact['email'] = email
|
||||
|
||||
# Téléphone
|
||||
phone = data.get('telephone')
|
||||
if phone:
|
||||
contact['phone'] = phone
|
||||
|
||||
# Adresse
|
||||
address = data.get('address')
|
||||
if address:
|
||||
if isinstance(address, dict):
|
||||
location_parts = []
|
||||
if address.get('addressLocality'):
|
||||
location_parts.append(address['addressLocality'])
|
||||
if address.get('addressRegion'):
|
||||
location_parts.append(address['addressRegion'])
|
||||
if address.get('addressCountry'):
|
||||
location_parts.append(address['addressCountry'])
|
||||
contact['location'] = ', '.join(location_parts)
|
||||
elif isinstance(address, str):
|
||||
contact['location'] = address
|
||||
|
||||
return contact if contact.get('email') or contact.get('name') else None
|
||||
|
||||
def _extract_microdata_contacts(self, soup: BeautifulSoup) -> List[Dict]:
|
||||
"""
|
||||
Extraire les contacts depuis les microdata
|
||||
"""
|
||||
contacts = []
|
||||
|
||||
# Chercher les éléments avec itemtype Person ou Organization
|
||||
items = soup.find_all(attrs={'itemtype': re.compile(r'.*(Person|Organization|LocalBusiness).*')})
|
||||
|
||||
for item in items:
|
||||
contact = {}
|
||||
|
||||
# Nom
|
||||
name_elem = item.find(attrs={'itemprop': 'name'})
|
||||
if name_elem:
|
||||
contact['name'] = name_elem.get_text(strip=True)
|
||||
|
||||
# Email
|
||||
email_elem = item.find(attrs={'itemprop': 'email'})
|
||||
if email_elem:
|
||||
contact['email'] = email_elem.get('href', '').replace('mailto:', '') or email_elem.get_text(strip=True)
|
||||
|
||||
# Téléphone
|
||||
phone_elem = item.find(attrs={'itemprop': 'telephone'})
|
||||
if phone_elem:
|
||||
contact['phone'] = phone_elem.get_text(strip=True)
|
||||
|
||||
if contact.get('email') or contact.get('name'):
|
||||
contacts.append(contact)
|
||||
|
||||
return contacts
|
||||
|
||||
def _extract_location_info(self, soup: BeautifulSoup, text: str) -> str:
|
||||
"""
|
||||
Extraire les informations de localisation
|
||||
"""
|
||||
location_indicators = [
|
||||
r'\b\d{4,5}\s+[A-Za-zÀ-ÿ\s\-]+\b', # Code postal + ville
|
||||
r'\b[A-Za-zÀ-ÿ\s\-]+,\s*[A-Za-zÀ-ÿ\s\-]+\b', # Ville, Pays
|
||||
]
|
||||
|
||||
# Chercher dans les balises d'adresse
|
||||
address_tags = soup.find_all(['address', 'div'], class_=re.compile(r'.*address.*|.*location.*|.*contact.*'))
|
||||
for tag in address_tags:
|
||||
address_text = tag.get_text(strip=True)
|
||||
for pattern in location_indicators:
|
||||
match = re.search(pattern, address_text, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(0)
|
||||
|
||||
# Chercher dans le texte global
|
||||
for pattern in location_indicators:
|
||||
match = re.search(pattern, text, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(0)
|
||||
|
||||
return ''
|
||||
|
||||
def _enhance_contact_info(self, contact: Dict, soup: BeautifulSoup, text: str, structured_contacts: List[Dict], phones: List[str]):
|
||||
"""
|
||||
Améliorer les informations de contact en croisant les données
|
||||
"""
|
||||
email = contact['email']
|
||||
|
||||
# Chercher dans les contacts structurés
|
||||
for struct_contact in structured_contacts:
|
||||
if struct_contact.get('email') == email:
|
||||
contact.update(struct_contact)
|
||||
break
|
||||
|
||||
# Si pas de nom trouvé, essayer d'extraire depuis l'email
|
||||
if not contact['name']:
|
||||
local_part = email.split('@')[0]
|
||||
domain_part = email.split('@')[1]
|
||||
|
||||
# Essayer de deviner le nom depuis la partie locale
|
||||
if '.' in local_part:
|
||||
parts = local_part.split('.')
|
||||
contact['first_name'] = parts[0].title()
|
||||
contact['last_name'] = parts[1].title() if len(parts) > 1 else ''
|
||||
contact['name'] = f"{contact['first_name']} {contact['last_name']}".strip()
|
||||
else:
|
||||
contact['name'] = local_part.title()
|
||||
|
||||
# Essayer de deviner l'entreprise depuis le domaine
|
||||
if not contact['company']:
|
||||
company_name = domain_part.split('.')[0]
|
||||
contact['company'] = company_name.title()
|
||||
|
||||
# Ajouter un numéro de téléphone si disponible
|
||||
if not contact['phone'] and phones:
|
||||
contact['phone'] = phones[0] # Prendre le premier numéro trouvé
|
||||
|
||||
# Enrichir les notes avec des informations contextuelles
|
||||
notes_parts = []
|
||||
if contact['location']:
|
||||
notes_parts.append(f"Localisation: {contact['location']}")
|
||||
|
||||
# Chercher des informations sur la fonction/titre
|
||||
title_patterns = [
|
||||
r'(?i)(?:directeur|manager|responsable|chef|président|ceo|cto|cfo)\s+[a-zA-ZÀ-ÿ\s]+',
|
||||
r'(?i)[a-zA-ZÀ-ÿ\s]+\s+(?:director|manager|head|chief|president)'
|
||||
]
|
||||
|
||||
for pattern in title_patterns:
|
||||
matches = re.findall(pattern, text)
|
||||
if matches:
|
||||
notes_parts.append(f"Fonction possible: {matches[0]}")
|
||||
break
|
||||
|
||||
contact['notes'] = ' | '.join(notes_parts)
|
||||
|
||||
def _merge_contact_info(self, existing: Dict, new: Dict):
|
||||
"""
|
||||
Fusionner les informations de deux contacts
|
||||
"""
|
||||
for key, value in new.items():
|
||||
if value and not existing.get(key):
|
||||
existing[key] = value
|
||||
|
||||
# Fusionner les notes
|
||||
if new.get('notes') and existing.get('notes'):
|
||||
existing['notes'] = f"{existing['notes']} | {new['notes']}"
|
||||
elif new.get('notes'):
|
||||
existing['notes'] = new['notes']
|
||||
|
||||
def _extract_domain_info(self, url: str, results: Dict):
|
||||
"""
|
||||
Extraire les informations générales du domaine
|
||||
"""
|
||||
domain = urlparse(url).netloc
|
||||
|
||||
results['domain_info'] = {
|
||||
'domain': domain,
|
||||
'company_guess': domain.split('.')[0].title(),
|
||||
'total_contacts': len(results['contacts']),
|
||||
'total_pages_scraped': len(results['pages_scraped'])
|
||||
}
|
||||
|
||||
def _extract_emails_from_links(self, soup: BeautifulSoup) -> Set[str]:
|
||||
"""
|
||||
Extraire les emails des liens mailto
|
||||
"""
|
||||
emails = set()
|
||||
|
||||
# Chercher les liens mailto
|
||||
mailto_links = soup.find_all('a', href=re.compile(r'^mailto:', re.I))
|
||||
for link in mailto_links:
|
||||
href = link.get('href', '')
|
||||
email_match = re.search(r'mailto:([^?&]+)', href, re.I)
|
||||
if email_match:
|
||||
email = email_match.group(1)
|
||||
if self._is_valid_email(email):
|
||||
emails.add(email.lower())
|
||||
|
||||
return emails
|
||||
|
||||
def _extract_emails_from_text(self, text: str) -> Set[str]:
|
||||
"""
|
||||
Extraire les emails du texte de la page
|
||||
"""
|
||||
emails = set()
|
||||
matches = self.email_pattern.findall(text)
|
||||
|
||||
for email in matches:
|
||||
# Filtrer les emails indésirables
|
||||
if not self._is_valid_email(email):
|
||||
continue
|
||||
emails.add(email.lower())
|
||||
|
||||
return emails
|
||||
|
||||
def _extract_internal_links(self, soup: BeautifulSoup, base_url: str) -> List[str]:
|
||||
"""
|
||||
Extraire les liens internes de la page
|
||||
"""
|
||||
links = []
|
||||
base_domain = urlparse(base_url).netloc
|
||||
|
||||
for link in soup.find_all('a', href=True):
|
||||
href = link['href']
|
||||
full_url = urljoin(base_url, href)
|
||||
parsed_link = urlparse(full_url)
|
||||
|
||||
# Vérifier que c'est un lien interne et pas déjà visité
|
||||
if (parsed_link.netloc == base_domain and
|
||||
full_url not in self.visited_urls and
|
||||
not self._is_excluded_link(full_url)):
|
||||
links.append(full_url)
|
||||
|
||||
return links
|
||||
|
||||
def _is_valid_email(self, email: str) -> bool:
|
||||
"""
|
||||
Vérifier si l'email est valide et non indésirable
|
||||
"""
|
||||
# Filtrer les extensions de fichiers communes
|
||||
excluded_extensions = ['.jpg', '.png', '.gif', '.pdf', '.doc', '.css', '.js']
|
||||
|
||||
for ext in excluded_extensions:
|
||||
if email.lower().endswith(ext):
|
||||
return False
|
||||
|
||||
# Filtrer les emails génériques indésirables
|
||||
excluded_patterns = [
|
||||
'example.com',
|
||||
'test.com',
|
||||
'placeholder',
|
||||
'your-email',
|
||||
'youremail',
|
||||
'email@',
|
||||
'noreply',
|
||||
'no-reply'
|
||||
]
|
||||
|
||||
for pattern in excluded_patterns:
|
||||
if pattern in email.lower():
|
||||
return False
|
||||
|
||||
# Vérifier la longueur
|
||||
if len(email) < 5 or len(email) > 254:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _is_excluded_link(self, url: str) -> bool:
|
||||
"""
|
||||
Vérifier si le lien doit être exclu du scraping
|
||||
"""
|
||||
excluded_patterns = [
|
||||
'#',
|
||||
'javascript:',
|
||||
'tel:',
|
||||
'mailto:',
|
||||
'.pdf',
|
||||
'.doc',
|
||||
'.zip',
|
||||
'.jpg',
|
||||
'.png',
|
||||
'.gif'
|
||||
]
|
||||
|
||||
url_lower = url.lower()
|
||||
for pattern in excluded_patterns:
|
||||
if pattern in url_lower:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def save_results(self, results: Dict, filename: str = None) -> str:
    """Persist scraping *results* as JSON under ``Data/email_scraping``.

    When *filename* is omitted, one is derived from the scraped domain and
    the current timestamp. Returns the path of the written file.
    """
    if not filename:
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        domain = urlparse(results['url']).netloc.replace('.', '_')
        filename = f"scraping_{domain}_{stamp}.json"

    target_dir = 'Data/email_scraping'
    os.makedirs(target_dir, exist_ok=True)  # create the folder on first use
    target_path = os.path.join(target_dir, filename)

    with open(target_path, 'w', encoding='utf-8') as handle:
        json.dump(results, handle, ensure_ascii=False, indent=2)

    return target_path
|
||||
|
||||
class EmailScrapingHistory:
    """File-backed history of scraping runs stored as JSON under Data/email_scraping."""

    def __init__(self):
        # ensure the storage folder exists before any read/write
        self.history_folder = 'Data/email_scraping'
        os.makedirs(self.history_folder, exist_ok=True)

    def get_all_scrapings(self) -> List[Dict]:
        """Summarize every recorded scraping run, most recent first."""
        summaries = []
        for entry in os.listdir(self.history_folder):
            if not entry.endswith('.json'):
                continue
            path = os.path.join(self.history_folder, entry)
            try:
                with open(path, 'r', encoding='utf-8') as handle:
                    payload = json.load(handle)
                summaries.append({
                    'filename': entry,
                    'url': payload.get('url', ''),
                    # supports both the old ('emails') and new ('contacts') layouts
                    'emails_count': len(payload.get('contacts', payload.get('emails', []))),
                    'pages_count': len(payload.get('pages_scraped', [])),
                    'start_time': payload.get('start_time', ''),
                    'errors_count': len(payload.get('errors', [])),
                })
            except Exception as e:
                print(f"Erreur lors de la lecture de (unknown): {e}")

        # newest first
        summaries.sort(key=lambda item: item.get('start_time', ''), reverse=True)
        return summaries

    def get_scraping_details(self, filename: str) -> Dict:
        """Return the full stored payload for *filename*, or None when absent."""
        path = os.path.join(self.history_folder, filename)
        if not os.path.exists(path):
            return None
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    def delete_scraping(self, filename: str) -> bool:
        """Delete the stored file; True on success, False when absent or on error."""
        path = os.path.join(self.history_folder, filename)
        if not os.path.exists(path):
            return False
        try:
            os.remove(path)
        except Exception as e:
            print(f"Erreur lors de la suppression: {e}")
            return False
        return True
|
||||
Loading…
Add table
Add a link
Reference in a new issue