first commit

mrtoine 2025-09-20 13:18:04 +02:00
commit e6c52820cd
227 changed files with 16156 additions and 0 deletions

modules/email/draft.py Normal file

@@ -0,0 +1,70 @@
from typing import Optional, Dict, Any
from datetime import datetime
import uuid
class EmailDraft:
"""
    Represents an automatically generated email draft.
    File storage: Data/email_drafts/<id>.json
"""
def __init__(
self,
prospect_id: str,
to_email: str,
subject: str,
content: str,
status: str = "draft", # draft | sent | failed
template_id: Optional[str] = None,
task_id: Optional[str] = None,
id: Optional[str] = None,
created_at: Optional[str] = None,
sent_at: Optional[str] = None,
error_message: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
):
self.id = id or f"ed_{uuid.uuid4().hex[:10]}"
self.prospect_id = prospect_id
self.to_email = to_email
self.subject = subject
self.content = content
self.status = status
self.template_id = template_id
self.task_id = task_id
self.created_at = created_at or datetime.utcnow().isoformat()
self.sent_at = sent_at
self.error_message = error_message
self.metadata = metadata or {}
def to_dict(self) -> Dict[str, Any]:
return {
"id": self.id,
"prospect_id": self.prospect_id,
"to_email": self.to_email,
"subject": self.subject,
"content": self.content,
"status": self.status,
"template_id": self.template_id,
"task_id": self.task_id,
"created_at": self.created_at,
"sent_at": self.sent_at,
"error_message": self.error_message,
"metadata": self.metadata,
}
@staticmethod
def from_dict(data: Dict[str, Any]) -> "EmailDraft":
return EmailDraft(
id=data.get("id"),
prospect_id=data.get("prospect_id", ""),
to_email=data.get("to_email", ""),
subject=data.get("subject", ""),
content=data.get("content", ""),
status=data.get("status", "draft"),
template_id=data.get("template_id"),
task_id=data.get("task_id"),
created_at=data.get("created_at"),
sent_at=data.get("sent_at"),
error_message=data.get("error_message"),
metadata=data.get("metadata") or {},
)
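
A quick round-trip sketch for this class (illustrative only; all values below are made up):

# Hypothetical usage: create a draft, serialize it, and restore it
draft = EmailDraft(
    prospect_id="p_001",
    to_email="jane@example.com",
    subject="Intro",
    content="<p>Bonjour</p>",
)
restored = EmailDraft.from_dict(draft.to_dict())
assert restored.id == draft.id and restored.status == "draft"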

modules/email/draft_handler.py Normal file

@@ -0,0 +1,89 @@
import os
import json
from typing import List, Optional, Dict, Any
from datetime import datetime
from modules.email.draft import EmailDraft
class DraftHandler:
"""
    Draft manager (JSON files).
    Directory: Data/email_drafts
"""
def __init__(self, base_dir: Optional[str] = None):
base_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
self.base_dir = base_dir or os.path.join(base_root, "Data", "email_drafts")
os.makedirs(self.base_dir, exist_ok=True)
def _draft_path(self, draft_id: str) -> str:
return os.path.join(self.base_dir, f"{draft_id}.json")
def add_draft(self, draft: EmailDraft) -> str:
path = self._draft_path(draft.id)
with open(path, "w", encoding="utf-8") as f:
json.dump(draft.to_dict(), f, ensure_ascii=False, indent=2)
return draft.id
def get_draft(self, draft_id: str) -> Optional[EmailDraft]:
path = self._draft_path(draft_id)
if not os.path.exists(path):
return None
with open(path, "r", encoding="utf-8") as f:
return EmailDraft.from_dict(json.load(f))
def update_draft(self, draft: EmailDraft) -> bool:
path = self._draft_path(draft.id)
if not os.path.exists(path):
return False
with open(path, "w", encoding="utf-8") as f:
json.dump(draft.to_dict(), f, ensure_ascii=False, indent=2)
return True
def delete_draft(self, draft_id: str) -> bool:
path = self._draft_path(draft_id)
if os.path.exists(path):
try:
os.remove(path)
return True
except Exception:
return False
return False
def list_drafts(self, status: Optional[str] = None) -> List[EmailDraft]:
drafts: List[EmailDraft] = []
for filename in os.listdir(self.base_dir):
if filename.endswith(".json"):
try:
with open(os.path.join(self.base_dir, filename), "r", encoding="utf-8") as f:
data = json.load(f)
d = EmailDraft.from_dict(data)
if status is None or d.status == status:
drafts.append(d)
except Exception:
continue
        # Sort: newest first
drafts.sort(key=lambda d: d.created_at or "", reverse=True)
return drafts
def list_pending(self) -> List[EmailDraft]:
return self.list_drafts(status="draft")
def mark_sent(self, draft_id: str, success: bool, error_message: Optional[str] = None) -> bool:
d = self.get_draft(draft_id)
if not d:
return False
d.status = "sent" if success else "failed"
d.sent_at = datetime.utcnow().isoformat()
d.error_message = None if success else (error_message or "Unknown error")
return self.update_draft(d)
def find_existing_for_task(self, task_id: str) -> Optional[EmailDraft]:
"""
        Avoids duplicates: if a draft with status 'draft' already exists for this task, return it.
        'failed' drafts do not block regeneration.
"""
for d in self.list_drafts():
if d.task_id == task_id and d.status == "draft":
return d
return None
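
A minimal usage sketch (assuming the default Data/email_drafts directory; the draft values are made up):

# Hypothetical usage: persist a draft, list pending ones, then mark it sent
handler = DraftHandler()
draft_id = handler.add_draft(EmailDraft(
    prospect_id="p_001",
    to_email="jane@example.com",
    subject="Intro",
    content="<p>Bonjour</p>",
))
print(len(handler.list_pending()))  # the new draft is included
handler.mark_sent(draft_id, success=True)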

@@ -0,0 +1,87 @@
from flask import Blueprint, request, redirect, url_for, flash, Response
from typing import List
from html import escape
from modules.email.draft_handler import DraftHandler
from modules.email.email_manager import EmailSender
email_drafts_bp = Blueprint("email_drafts", __name__, url_prefix="/email/drafts")
@email_drafts_bp.get("/")
def list_drafts_page():
"""
    Minimal HTML page listing the drafts, each with a [Send] button.
    No Jinja template needed (inline HTML keeps the integration simple).
"""
handler = DraftHandler()
drafts = handler.list_pending()
def row_html(d):
        # HTML content injected as-is (content is assumed to be safe, pre-sanitized HTML)
return f"""
<div style="border:1px solid #ddd; padding:12px; margin-bottom:12px; border-radius:8px;">
<div style="display:flex; justify-content:space-between; align-items:center;">
<h3 style="margin:0; font-size:1.05rem;">{escape(d.subject)}</h3>
<form method="post" action="{url_for('email_drafts.send_draft')}">
<input type="hidden" name="draft_id" value="{escape(d.id)}" />
<button type="submit" style="padding:6px 12px;">Envoyer</button>
</form>
</div>
<div style="color:#555; margin:6px 0 8px 0;">À: {escape(d.to_email)}</div>
<div style="background:#fafafa; padding:10px; border-radius:6px;">{d.content}</div>
<div style="font-size:12px; color:#777; margin-top:6px;">
Prospect: {escape(d.prospect_id)} | Template: {escape(d.template_id or '-') }
</div>
</div>
"""
items_html = "\n".join(row_html(d) for d in drafts) or "<p>Aucun brouillon à envoyer.</p>"
page = f"""
<!doctype html>
<html lang="fr">
<head>
<meta charset="utf-8">
<title>Brouillons d'emails</title>
<meta name="viewport" content="width=device-width, initial-scale=1" />
</head>
<body style="max-width:920px; margin: 20px auto; font-family: system-ui, -apple-system, Segoe UI, Roboto, sans-serif;">
<h2>Brouillons d'emails à envoyer</h2>
<div>{items_html}</div>
</body>
</html>
"""
return Response(page, mimetype="text/html")
@email_drafts_bp.post("/send")
def send_draft():
"""
    Sends the selected draft, then updates its status.
"""
draft_id = (request.form.get("draft_id") or "").strip()
if not draft_id:
flash("Brouillon invalide", "warning")
return redirect(url_for("email_drafts.list_drafts_page"))
handler = DraftHandler()
draft = handler.get_draft(draft_id)
if not draft:
flash("Brouillon introuvable", "danger")
return redirect(url_for("email_drafts.list_drafts_page"))
sender = EmailSender()
try:
res = sender.send_email(draft.to_email, draft.subject, draft.content)
if res.get("success"):
handler.mark_sent(draft.id, success=True)
flash("Email envoyé.", "success")
else:
handler.mark_sent(draft.id, success=False, error_message=res.get("error"))
flash("Échec de l'envoi de l'email.", "danger")
except Exception as e:
handler.mark_sent(draft.id, success=False, error_message=str(e))
flash("Erreur lors de l'envoi de l'email.", "danger")
return redirect(url_for("email_drafts.list_drafts_page"))
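
A registration sketch (the Flask app object and secret key below are illustrative, not part of this commit):

# Hypothetical wiring: flash() needs a secret key, and the blueprint's
# url_prefix mounts its routes under /email/drafts.
from flask import Flask

app = Flask(__name__)
app.secret_key = "change-me"  # required for flash messages
app.register_blueprint(email_drafts_bp)
# GET  /email/drafts/      -> list_drafts_page
# POST /email/drafts/send  -> send_draft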

modules/email/email_manager.py Normal file

@@ -0,0 +1,353 @@
from typing import List, Dict, Any, Union
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from datetime import datetime
import os
import json
import re
from urllib.parse import quote_plus
from uuid import uuid4
from core.data import Data
class EmailTemplate:
"""Classe gérant les templates d'emails"""
def __init__(self, template_folder="Data/email_templates"):
self.template_folder = template_folder
# Créer le dossier de templates s'il n'existe pas
if not os.path.exists(self.template_folder):
os.makedirs(self.template_folder)
def get_all_templates(self):
"""Retourne tous les templates disponibles"""
templates = []
if os.path.exists(self.template_folder):
for filename in os.listdir(self.template_folder):
if filename.endswith('.json'):
template_path = os.path.join(self.template_folder, filename)
try:
data_manager = Data(template_path)
template_data = data_manager.load_data()
templates.append(template_data)
except Exception as e:
print(f"Erreur lors du chargement du template {filename}: {e}")
return templates
def get_template_by_id(self, template_id):
"""Récupère un template par son ID"""
template_path = os.path.join(self.template_folder, f"{template_id}.json")
if os.path.exists(template_path):
data_manager = Data(template_path)
return data_manager.load_data()
return None
def save_template(self, template_data):
"""Sauvegarde un template d'email"""
template_id = template_data.get('id')
if not template_id:
# Générer un ID s'il n'existe pas
import uuid
template_id = f"tpl_{uuid.uuid4().hex[:8]}"
template_data['id'] = template_id
template_path = os.path.join(self.template_folder, f"{template_id}.json")
data_manager = Data(template_path)
data_manager.save_data(template_data)
return template_data
def delete_template(self, template_id):
"""Supprime un template d'email"""
template_path = os.path.join(self.template_folder, f"{template_id}.json")
if os.path.exists(template_path):
os.remove(template_path)
return True
return False
def render_template(self, template_id, context=None):
"""Rend un template avec les variables spécifiées dans le contexte"""
template = self.get_template_by_id(template_id)
if not template:
return None
subject = template.get('subject', '')
content = template.get('content', '')
        # Replace {{placeholders}} in the subject and the content
if context:
for key, value in context.items():
placeholder = f"{{{{{key}}}}}"
subject = subject.replace(placeholder, str(value))
content = content.replace(placeholder, str(value))
return {
"subject": subject,
"content": content
}
class EmailSender:
"""Classe gérant l'envoi d'emails"""
def __init__(self, config_file="config/email_config.json"):
# Ensure config_file is an absolute path
if not os.path.isabs(config_file):
base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
self.config_file = os.path.join(base_dir, config_file)
else:
self.config_file = config_file
self.config = self._load_config()
self.template_manager = EmailTemplate()
def _load_config(self):
"""Charge la configuration email depuis le fichier de configuration"""
config_dir = os.path.dirname(self.config_file)
if not os.path.exists(config_dir):
os.makedirs(config_dir)
print(f"Loading email config from: {self.config_file}")
if os.path.exists(self.config_file):
try:
with open(self.config_file, 'r') as f:
config = json.load(f)
print(f"Loaded email config: {config}")
return config
except Exception as e:
print(f"Erreur lors du chargement de la configuration email: {e}")
# Configuration par défaut
default_config = {
"smtp_server": "smtp.gmail.com",
"smtp_port": 587,
"username": "",
"password": "",
"sender_name": "Suite Consultance",
"sender_email": ""
}
print(f"Using default email config: {default_config}")
return default_config
def save_config(self, config):
"""Sauvegarde la configuration email"""
config_dir = os.path.dirname(self.config_file)
if not os.path.exists(config_dir):
os.makedirs(config_dir)
with open(self.config_file, 'w') as f:
json.dump(config, f, indent=4)
self.config = config
return True
def send_email(self, to_email, subject, body, cc=None, bcc=None):
"""Envoie un email à un destinataire"""
if not self.config.get('username') or not self.config.get('password'):
raise ValueError("La configuration email n'est pas complète")
message = MIMEMultipart()
message["From"] = f"{self.config.get('sender_name')} <{self.config.get('sender_email')}>"
message["To"] = to_email
message["Subject"] = subject
if cc:
message["Cc"] = ", ".join(cc) if isinstance(cc, list) else cc
if bcc:
message["Bcc"] = ", ".join(bcc) if isinstance(bcc, list) else bcc
message.attach(MIMEText(body, "html"))
try:
server = smtplib.SMTP(self.config.get('smtp_server'), self.config.get('smtp_port'))
server.starttls()
server.login(self.config.get('username'), self.config.get('password'))
recipients = [to_email]
if cc:
recipients.extend(cc if isinstance(cc, list) else [cc])
if bcc:
recipients.extend(bcc if isinstance(bcc, list) else [bcc])
server.sendmail(self.config.get('sender_email'), recipients, message.as_string())
server.quit()
return {
"success": True,
"timestamp": datetime.now().isoformat(),
"to": to_email,
"subject": subject
}
except Exception as e:
return {
"success": False,
"error": str(e),
"timestamp": datetime.now().isoformat()
}
def send_templated_email(self, to_email, template_id, context=None, cc=None, bcc=None):
"""Envoie un email basé sur un template à un destinataire"""
rendered = self.template_manager.render_template(template_id, context)
if not rendered:
return {
"success": False,
"error": "Template not found",
"timestamp": datetime.now().isoformat()
}
return self.send_email(to_email, rendered['subject'], rendered['content'], cc, bcc)
def send_bulk_email(self, emails, subject, body, cc=None, bcc=None):
"""Envoie le même email à plusieurs destinataires"""
results = []
for email in emails:
result = self.send_email(email, subject, body, cc, bcc)
results.append({
"email": email,
**result
})
return results
def send_bulk_templated_email(self, recipients, template_id, cc=None, bcc=None):
"""
        Send a template-based email to multiple recipients.
        recipients: a list of dicts, each holding the recipient's email and template context:
[{
"email": "example@example.com",
"context": {"name": "John Doe", "company": "ACME Inc."}
}]
"""
results = []
for recipient in recipients:
email = recipient.get('email')
context = recipient.get('context', {})
result = self.send_templated_email(email, template_id, context, cc, bcc)
results.append({
"email": email,
**result
})
return results
# ---------- Tracking helpers ----------
def _embed_tracking(self, html_body: str, tracking_id: str, prospect_id: str) -> str:
"""
        Adds an open-tracking pixel and rewrites links for click tracking.
        Uses APP_BASE_URL if set; otherwise generates relative links.
"""
base = (os.environ.get("APP_BASE_URL") or "").rstrip("/")
        prefix = f"{base}/tasks/t"  # tracking routes are mounted on the 'tasks' blueprint
        # Open-tracking pixel (1x1 PNG)
pixel = f'<img src="{prefix}/o/{tracking_id}.png?pid={quote_plus(prospect_id)}" alt="" width="1" height="1" style="display:none;" />'
body = html_body or ""
        # Inject the pixel just before the closing </body> tag when possible
        if "</body>" in body.lower():
            # locate the actual tag while preserving its case
idx = body.lower().rfind("</body>")
body = body[:idx] + pixel + body[idx:]
else:
body = body + pixel
        # Rewrite <a href="..."> links through the click-tracking redirect
        def _rewrite(match):
            url = match.group(1)
            # skip if already tracked
if "/tasks/t/c/" in url:
return f'href="{url}"'
tracked = f'{prefix}/c/{tracking_id}?u={quote_plus(url)}'
return f'href="{tracked}"'
body = re.sub(r'href="([^"]+)"', _rewrite, body)
return body
def send_tracked_email(self, to_email: str, subject: str, body: str, prospect_id: str, template_id: str = None, cc=None, bcc=None) -> Dict[str, Any]:
"""
        Sends an email with open/click tracking.
        Creates a tracking record, injects a pixel, and rewrites the links.
        """
        tracking_id = f"trk_{uuid4().hex[:16]}"
        # Create the tracking record
try:
from modules.tracking.store import TrackingStore
store = TrackingStore()
store.create_record(tracking_id, {
"prospect_id": prospect_id,
"to": to_email,
"subject": subject,
"template_id": template_id,
"opens": 0,
"clicks": 0,
})
except Exception:
            # even if the tracking store fails, still try to send the email
pass
tracked_body = self._embed_tracking(body, tracking_id, prospect_id)
result = self.send_email(to_email, subject, tracked_body, cc, bcc)
result["tracking_id"] = tracking_id
return result
class EmailHistory:
"""Classe gérant l'historique des emails envoyés"""
def __init__(self, history_folder="Data/email_history"):
self.history_folder = history_folder
# Créer le dossier d'historique s'il n'existe pas
if not os.path.exists(self.history_folder):
os.makedirs(self.history_folder)
def add_email_record(self, prospect_id, email_data):
"""Ajoute un email à l'historique d'un prospect"""
history_file = os.path.join(self.history_folder, f"{prospect_id}.json")
# Charger l'historique existant
history = []
if os.path.exists(history_file):
try:
with open(history_file, 'r') as f:
history = json.load(f)
            except Exception:
                history = []
        # Append the new email to the history
history.append({
**email_data,
"timestamp": datetime.now().isoformat()
})
        # Save the history
with open(history_file, 'w') as f:
json.dump(history, f, indent=4)
return True
def get_prospect_email_history(self, prospect_id):
"""Récupère l'historique des emails pour un prospect"""
history_file = os.path.join(self.history_folder, f"{prospect_id}.json")
if os.path.exists(history_file):
try:
with open(history_file, 'r') as f:
return json.load(f)
            except Exception:
return []
return []
def get_all_email_history(self):
"""Récupère l'historique de tous les emails envoyés"""
all_history = {}
if os.path.exists(self.history_folder):
for filename in os.listdir(self.history_folder):
if filename.endswith('.json'):
prospect_id = filename.split('.')[0]
history_file = os.path.join(self.history_folder, filename)
try:
with open(history_file, 'r') as f:
all_history[prospect_id] = json.load(f)
                    except Exception:
all_history[prospect_id] = []
return all_history
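
A minimal end-to-end sketch (assuming config/email_config.json holds valid SMTP credentials; the template and addresses are made up):

# Hypothetical usage: save a template, then send a tracked email
sender = EmailSender()
tpl = sender.template_manager.save_template({
    "subject": "Hello {{name}}",
    "content": "<p>Offer for {{company}}</p>",
})
result = sender.send_tracked_email(
    to_email="jane@example.com",
    subject="Hello Jane",
    body="<p>Offer for ACME</p>",
    prospect_id="p_001",
    template_id=tpl["id"],
)
print(result.get("success"), result.get("tracking_id"))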

@@ -0,0 +1,968 @@
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import time
from typing import List, Set, Dict
import json
import os
from datetime import datetime
class EmailScraper:
def __init__(self):
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
})
        self.email_pattern = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')
self.phone_pattern = re.compile(r'(?:\+32|0)\s?[1-9](?:[\s\-\.\/]?\d){8}|\+32\s?[1-9](?:[\s\-\.\/]?\d){8}|(?:\+33|0)[1-9](?:[\s\-\.\/]?\d){8}')
self.visited_urls = set()
self.found_emails = set()
self.contact_info = {}
def scrape_page(self, url: str, max_pages: int = 10) -> Dict:
"""
        Scrape a page, following pagination, to extract business data
"""
results = {
'url': url,
            'contacts': [],  # list of contacts with email, name, phone, etc.
'pages_scraped': [],
'errors': [],
'start_time': datetime.now().isoformat(),
'end_time': None,
'domain_info': {}
}
try:
self._scrape_with_pagination(url, results, max_pages)
self._extract_domain_info(url, results)
except Exception as e:
results['errors'].append(f"Erreur générale: {str(e)}")
results['end_time'] = datetime.now().isoformat()
return results
def _scrape_with_pagination(self, base_url: str, results: Dict, max_pages: int):
"""
        Scrape while handling pagination
"""
current_page = 1
current_url = base_url
while current_page <= max_pages:
if current_url in self.visited_urls:
break
try:
                # Normalize the URL
parsed_url = urlparse(current_url)
if not parsed_url.scheme:
current_url = 'https://' + current_url
self.visited_urls.add(current_url)
print(f"Scraping page {current_page}: {current_url}")
                # Make the request
                response = self.session.get(current_url, timeout=15)
                response.raise_for_status()
                # Parse the HTML
                soup = BeautifulSoup(response.content, 'html.parser')
                # Extract the businesses/contacts from the page
page_contacts = self._extract_business_contacts(soup, response.text, current_url)
                # Add the contacts to the main list
                for contact in page_contacts:
                    # Check whether this contact already exists (by email)
                    existing_contact = next((c for c in results['contacts'] if c['email'] == contact['email']), None)
                    if existing_contact:
                        # Merge the information if the contact already exists
self._merge_contact_info(existing_contact, contact)
else:
results['contacts'].append(contact)
results['pages_scraped'].append({
'url': current_url,
'page_number': current_page,
'contacts_found': len(page_contacts),
'contacts': page_contacts,
'status': 'success',
'timestamp': datetime.now().isoformat()
})
print(f" - Page {current_page}: Trouvé {len(page_contacts)} contact(s)")
# Si aucun contact trouvé, peut-être qu'on a atteint la fin
if len(page_contacts) == 0:
print(f" - Aucun contact trouvé sur la page {current_page}, arrêt du scraping")
break
                # Look for the link to the next page
                next_url = self._find_next_page_url(soup, current_url, current_page)
                if not next_url:
                    print(" - No next page found, stopping the scrape")
                    break
                current_url = next_url
                current_page += 1
                # Delay between pages to avoid overloading the server
time.sleep(2)
except requests.exceptions.RequestException as e:
results['errors'].append(f"Erreur de requête pour la page {current_page} ({current_url}): {str(e)}")
results['pages_scraped'].append({
'url': current_url,
'page_number': current_page,
'contacts_found': 0,
'contacts': [],
'status': 'error',
'error': str(e),
'timestamp': datetime.now().isoformat()
})
break
except Exception as e:
results['errors'].append(f"Erreur lors du parsing de la page {current_page}: {str(e)}")
break
def _extract_business_contacts(self, soup: BeautifulSoup, text: str, page_url: str) -> List[Dict]:
"""
        Extract business information from a page (specialized for directories)
        """
        contacts = []
        # Look for common business containers
business_containers = self._find_business_containers(soup)
if business_containers:
            # If structured containers were found, process them
for container in business_containers:
contact = self._extract_contact_from_container(container, page_url)
if contact and contact.get('email'):
contacts.append(contact)
else:
            # Fallback: general extraction as before
contacts = self._extract_contact_info(soup, text, page_url)
return contacts
def _find_business_containers(self, soup: BeautifulSoup) -> List:
"""
        Find the containers that most likely hold business information
        """
        containers = []
        # Common patterns for business directories
        business_selectors = [
            # Common classes/IDs
'[class*="business"]',
'[class*="company"]',
'[class*="enterprise"]',
'[class*="contact"]',
'[class*="listing"]',
'[class*="directory"]',
'[class*="card"]',
'[class*="item"]',
'[class*="entry"]',
'[class*="result"]',
            # Semantic tags
'article',
'[itemtype*="Organization"]',
'[itemtype*="LocalBusiness"]',
            # List structures
'li[class*="business"]',
'li[class*="company"]',
'div[class*="row"]',
'div[class*="col"]'
]
for selector in business_selectors:
try:
elements = soup.select(selector)
for element in elements:
                    # Check whether the element holds useful information
if self._container_has_business_info(element):
containers.append(element)
            except Exception:
continue
        # Content-based deduplication
unique_containers = []
for container in containers:
if not any(self._containers_are_similar(container, existing) for existing in unique_containers):
unique_containers.append(container)
        return unique_containers[:50]  # cap the count to avoid overload
def _container_has_business_info(self, container) -> bool:
"""
        Check whether a container holds business information
"""
text = container.get_text(strip=True).lower()
        # Indicators of business information
business_indicators = [
'@', 'email', 'mail', 'contact',
'tel', 'phone', 'telephone', 'gsm',
'rue', 'avenue', 'boulevard', 'place',
'www.', 'http', '.com', '.be', '.fr',
'sarl', 'sprl', 'sa', 'nv', 'bvba'
]
score = sum(1 for indicator in business_indicators if indicator in text)
return score >= 2 and len(text) > 20
def _containers_are_similar(self, container1, container2) -> bool:
"""
        Check whether two containers are similar (to avoid duplicates)
"""
text1 = container1.get_text(strip=True)
text2 = container2.get_text(strip=True)
        # If the texts are identical or highly similar
        if text1 == text2:
            return True
        # If one container is contained in the other
if len(text1) > len(text2):
return text2 in text1
else:
return text1 in text2
def _extract_contact_from_container(self, container, page_url: str) -> Dict:
"""
        Extract contact information from a specific container
"""
contact = {
'email': '',
'name': '',
'first_name': '',
'last_name': '',
'company': '',
'phone': '',
'location': '',
'source_url': page_url,
'notes': ''
}
        # Extract the email from individual tags first
        email_found = False
        # Look in mailto links
mailto_links = container.find_all('a', href=re.compile(r'^mailto:', re.I))
if mailto_links:
href = mailto_links[0].get('href', '')
email_match = re.search(r'mailto:([^?&]+)', href, re.I)
if email_match and self._is_valid_email(email_match.group(1)):
contact['email'] = email_match.group(1).lower()
email_found = True
        # If not found in a mailto link, look in individual tags
if not email_found:
for element in container.find_all(['p', 'div', 'span', 'td', 'li']):
element_text = element.get_text(strip=True)
                # Pad with spaces so text from adjacent tags does not concatenate
element_text = ' ' + element_text + ' '
email_matches = self.email_pattern.findall(element_text)
if email_matches:
for email in email_matches:
email = email.strip()
if re.match(r'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$', email) and self._is_valid_email(email):
contact['email'] = email.lower()
email_found = True
break
if email_found:
break
        # If still not found, search the full text with more precise patterns
        if not email_found:
            container_text = container.get_text(separator=' ', strip=True)  # use a separator
            # Context-aware patterns to avoid stray captures
context_patterns = [
r'(?:email|e-mail|mail|contact)\s*:?\s*([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})',
r'([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})(?=\s|$|[^\w.-])',
]
for pattern in context_patterns:
matches = re.findall(pattern, container_text, re.IGNORECASE)
if matches:
email = matches[0] if isinstance(matches[0], str) else matches[0][0] if matches[0] else ''
if email and self._is_valid_email(email):
contact['email'] = email.lower()
email_found = True
break
        # Extract the phone number
container_text = container.get_text(separator=' ', strip=True)
phone_matches = self.phone_pattern.findall(container_text)
if phone_matches:
            # Take the first number and clean it up
            phone = phone_matches[0]
            # Keep digits, spaces, dashes, dots, slashes and '+' only: cut at the first other character
            clean_phone = re.sub(r'[^0-9\s\-\.\/\+].*$', '', phone)
contact['phone'] = clean_phone.strip()
        # Extract the company name
        contact['company'] = self._extract_company_name(container, container_text)
        # Extract person names
        names = self._extract_person_names(container, container_text)
        if names:
            contact.update(names)
        # Extract the location
        contact['location'] = self._extract_location_from_container(container, container_text)
        # Enrich with contextual information
self._enhance_business_contact(contact, container, container_text)
return contact if contact['email'] or contact['company'] else None
def _extract_company_name(self, container, text: str) -> str:
"""
        Extract the company name from a container
"""
        # Look in title-like tags (h1-h6, strong, b); class-based candidates use CSS selectors,
        # which find_all() does not parse, so they go through select() instead
        title_elements = container.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong', 'b'])
        title_elements += container.select('[class*="title"], [class*="name"], [class*="company"]')
for element in title_elements:
company_text = element.get_text(strip=True)
if len(company_text) > 2 and len(company_text) < 100:
                # Skip texts that are too generic
if not any(generic in company_text.lower() for generic in ['accueil', 'contact', 'email', 'téléphone', 'adresse']):
return company_text
        # Fallback: take the first non-empty line that looks like a name
lines = text.split('\n')
        for line in lines[:3]:  # the first 3 lines
line = line.strip()
if len(line) > 2 and len(line) < 100 and not '@' in line and not any(char.isdigit() for char in line[:3]):
return line
return ''
def _extract_person_names(self, container, text: str) -> Dict:
"""
        Extract person names
"""
names = {'name': '', 'first_name': '', 'last_name': ''}
        # Patterns for person names
        name_patterns = [
            r'\b([A-Z][a-zÀ-ÿ]+)\s+([A-Z][a-zÀ-ÿ]+)\b',  # FirstName LastName
            r'\b([A-Z][A-Z]+)\s+([A-Z][a-zÀ-ÿ]+)\b',  # LASTNAME FirstName
]
        # Look in specific tags (CSS class selectors require select(), not find_all())
        name_elements = container.select('[class*="name"], [class*="contact"], [class*="person"]')
for element in name_elements:
element_text = element.get_text(strip=True)
for pattern in name_patterns:
match = re.search(pattern, element_text)
if match:
names['first_name'] = match.group(1)
names['last_name'] = match.group(2)
names['name'] = f"{names['first_name']} {names['last_name']}"
return names
        # If not found in the tags, search the text
for pattern in name_patterns:
match = re.search(pattern, text)
if match:
names['first_name'] = match.group(1)
names['last_name'] = match.group(2)
names['name'] = f"{names['first_name']} {names['last_name']}"
break
return names
def _extract_location_from_container(self, container, text: str) -> str:
"""
        Extract the location from a container
"""
        # Look in address tags (select() handles both tag names and class selectors)
        address_elements = container.select('address, [class*="address"], [class*="location"], [class*="ville"], [class*="city"]')
for element in address_elements:
location_text = element.get_text(strip=True)
if len(location_text) > 5:
return location_text
        # Patterns for Belgian/French addresses
        location_patterns = [
            r'\b\d{4,5}\s+[A-Za-zÀ-ÿ\s\-]+\b',  # postal code + city
            r'\b[A-Za-zÀ-ÿ\s\-]+,\s*[A-Za-zÀ-ÿ\s\-]+\b',  # city, region/country
            r'\b(?:rue|avenue|boulevard|place|chemin)\s+[A-Za-zÀ-ÿ\s\d\-,]+\b'  # full street address
]
for pattern in location_patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
return match.group(0).strip()
return ''
def _enhance_business_contact(self, contact: Dict, container, text: str):
"""
        Enrich the business contact's information
        """
        # If no name was found, try deriving one from the email
if not contact['name'] and contact['email']:
local_part = contact['email'].split('@')[0]
domain_part = contact['email'].split('@')[1]
if '.' in local_part:
parts = local_part.split('.')
contact['first_name'] = parts[0].title()
contact['last_name'] = parts[1].title() if len(parts) > 1 else ''
contact['name'] = f"{contact['first_name']} {contact['last_name']}".strip()
            # If no company, try guessing one from the domain
if not contact['company']:
company_name = domain_part.split('.')[0]
contact['company'] = company_name.title()
        # Enrich the notes with contextual information
        notes_parts = []
        # Look for hints about the business activity
activity_patterns = [
r'(?i)\b(restaurant|café|boulangerie|pharmacie|garage|coiffeur|médecin|avocat|comptable|architecte|dentiste|vétérinaire|magasin|boutique|salon)\b',
r'(?i)\b(commerce|service|entreprise|société|bureau|cabinet|clinique|centre|institut)\b'
]
for pattern in activity_patterns:
matches = re.findall(pattern, text)
if matches:
notes_parts.append(f"Activité: {', '.join(set(matches))}")
break
        # Look for opening hours
horaires_pattern = r'(?i)(?:ouvert|fermé|horaires?)[:\s]*([^.!?\n]{10,50})'
horaires_match = re.search(horaires_pattern, text)
if horaires_match:
notes_parts.append(f"Horaires: {horaires_match.group(1).strip()}")
        # Look for a website
website_pattern = r'\b(?:www\.)?[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9]*\.(?:com|be|fr|org|net)\b'
website_match = re.search(website_pattern, text)
if website_match:
notes_parts.append(f"Site web: {website_match.group(0)}")
contact['notes'] = ' | '.join(notes_parts)
def _find_next_page_url(self, soup: BeautifulSoup, current_url: str, current_page: int) -> str:
"""
        Find the URL of the next page
"""
        # Common patterns for pagination links
        # (':-soup-contains' is the soupsieve form of the deprecated ':contains')
        next_patterns = [
            # Links with text
'a[href]:contains("Suivant")',
'a[href]:contains("Next")',
'a[href]:contains(">")',
'a[href]:contains("Page suivante")',
# Liens avec classes
'a[class*="next"]',
'a[class*="suivant"]',
'a[class*="pagination"]',
# Numéros de page
f'a[href]:contains("{current_page + 1}")',
]
for pattern in next_patterns:
try:
links = soup.select(pattern)
for link in links:
href = link.get('href')
if href:
                        # Build the full URL
if href.startswith('http'):
return href
elif href.startswith('/'):
parsed = urlparse(current_url)
return f"{parsed.scheme}://{parsed.netloc}{href}"
else:
return urljoin(current_url, href)
            except Exception:
continue
        # Try building the next page URL from common patterns
        # Pattern 1: ?page=X
        if 'page=' in current_url:
            return re.sub(r'page=\d+', f'page={current_page + 1}', current_url)
        # Pattern 2: /pageX
        if f'/page{current_page}' in current_url:
            return current_url.replace(f'/page{current_page}', f'/page{current_page + 1}')
        # Pattern 3: append ?page=2 if this is the first page
if current_page == 1:
separator = '&' if '?' in current_url else '?'
return f"{current_url}{separator}page={current_page + 1}"
return None
def _extract_contact_info(self, soup: BeautifulSoup, text: str, page_url: str) -> List[Dict]:
"""
        Extract complete contact information from a page
"""
contacts = []
        # Extract all the emails
        emails = set()
        emails.update(self._extract_emails_from_text(text))
        emails.update(self._extract_emails_from_links(soup))
        # Extract phone numbers
        phones = self._extract_phone_numbers(text)
        # Extract names and companies from structured markup
        structured_contacts = self._extract_structured_contacts(soup)
        # Extract the address/locality
        location = self._extract_location_info(soup, text)
        # Create a contact for each email found
for email in emails:
if not self._is_valid_email(email):
continue
contact = {
'email': email.lower(),
'name': '',
'first_name': '',
'last_name': '',
'company': '',
'phone': '',
'location': location,
'source_url': page_url,
'notes': ''
}
            # Try to find complementary information
self._enhance_contact_info(contact, soup, text, structured_contacts, phones)
contacts.append(contact)
return contacts
def _extract_phone_numbers(self, text: str) -> List[str]:
"""
        Extract phone numbers
"""
phones = []
matches = self.phone_pattern.findall(text)
for phone in matches:
            # Clean up the number
            clean_phone = re.sub(r'[\s\-\.\/]', '', phone)
            if len(clean_phone) >= 9:  # valid number
phones.append(phone)
return phones
def _extract_structured_contacts(self, soup: BeautifulSoup) -> List[Dict]:
"""
        Extract contacts from structured data (microdata, JSON-LD, etc.)
"""
contacts = []
        # Look for JSON-LD data
json_scripts = soup.find_all('script', type='application/ld+json')
for script in json_scripts:
try:
data = json.loads(script.string)
if isinstance(data, dict):
contact = self._parse_json_ld_contact(data)
if contact:
contacts.append(contact)
elif isinstance(data, list):
for item in data:
contact = self._parse_json_ld_contact(item)
if contact:
contacts.append(contact)
            except Exception:
                continue
        # Look for microdata
contacts.extend(self._extract_microdata_contacts(soup))
return contacts
def _parse_json_ld_contact(self, data: Dict) -> Dict:
"""
        Parse a contact from JSON-LD data
"""
contact = {}
if data.get('@type') in ['Organization', 'LocalBusiness', 'Person']:
contact['name'] = data.get('name', '')
contact['company'] = data.get('name', '') if data.get('@type') != 'Person' else ''
# Email
email = data.get('email')
if email:
contact['email'] = email
            # Phone
            phone = data.get('telephone')
            if phone:
                contact['phone'] = phone
            # Address
address = data.get('address')
if address:
if isinstance(address, dict):
location_parts = []
if address.get('addressLocality'):
location_parts.append(address['addressLocality'])
if address.get('addressRegion'):
location_parts.append(address['addressRegion'])
if address.get('addressCountry'):
location_parts.append(address['addressCountry'])
contact['location'] = ', '.join(location_parts)
elif isinstance(address, str):
contact['location'] = address
return contact if contact.get('email') or contact.get('name') else None
def _extract_microdata_contacts(self, soup: BeautifulSoup) -> List[Dict]:
"""
        Extract contacts from microdata
"""
contacts = []
        # Look for elements whose itemtype is Person or Organization
items = soup.find_all(attrs={'itemtype': re.compile(r'.*(Person|Organization|LocalBusiness).*')})
for item in items:
contact = {}
            # Name
name_elem = item.find(attrs={'itemprop': 'name'})
if name_elem:
contact['name'] = name_elem.get_text(strip=True)
# Email
email_elem = item.find(attrs={'itemprop': 'email'})
if email_elem:
contact['email'] = email_elem.get('href', '').replace('mailto:', '') or email_elem.get_text(strip=True)
            # Phone
phone_elem = item.find(attrs={'itemprop': 'telephone'})
if phone_elem:
contact['phone'] = phone_elem.get_text(strip=True)
if contact.get('email') or contact.get('name'):
contacts.append(contact)
return contacts
def _extract_location_info(self, soup: BeautifulSoup, text: str) -> str:
"""
        Extract location information
"""
location_indicators = [
            r'\b\d{4,5}\s+[A-Za-zÀ-ÿ\s\-]+\b',  # postal code + city
            r'\b[A-Za-zÀ-ÿ\s\-]+,\s*[A-Za-zÀ-ÿ\s\-]+\b',  # city, country
]
        # Look in address tags
address_tags = soup.find_all(['address', 'div'], class_=re.compile(r'.*address.*|.*location.*|.*contact.*'))
for tag in address_tags:
address_text = tag.get_text(strip=True)
for pattern in location_indicators:
match = re.search(pattern, address_text, re.IGNORECASE)
if match:
return match.group(0)
        # Search the full page text
for pattern in location_indicators:
match = re.search(pattern, text, re.IGNORECASE)
if match:
return match.group(0)
return ''
def _enhance_contact_info(self, contact: Dict, soup: BeautifulSoup, text: str, structured_contacts: List[Dict], phones: List[str]):
"""
        Improve contact information by cross-referencing the data
"""
email = contact['email']
        # Look in the structured contacts
for struct_contact in structured_contacts:
if struct_contact.get('email') == email:
contact.update(struct_contact)
break
        # If no name was found, try deriving one from the email
        if not contact['name']:
            local_part = email.split('@')[0]
            domain_part = email.split('@')[1]
            # Try to guess the name from the local part
if '.' in local_part:
parts = local_part.split('.')
contact['first_name'] = parts[0].title()
contact['last_name'] = parts[1].title() if len(parts) > 1 else ''
contact['name'] = f"{contact['first_name']} {contact['last_name']}".strip()
else:
contact['name'] = local_part.title()
            # Try to guess the company from the domain
if not contact['company']:
company_name = domain_part.split('.')[0]
contact['company'] = company_name.title()
        # Add a phone number if one is available
        if not contact['phone'] and phones:
            contact['phone'] = phones[0]  # take the first number found
        # Enrich the notes with contextual information
notes_parts = []
if contact['location']:
notes_parts.append(f"Localisation: {contact['location']}")
        # Look for role/title information
title_patterns = [
r'(?i)(?:directeur|manager|responsable|chef|président|ceo|cto|cfo)\s+[a-zA-ZÀ-ÿ\s]+',
r'(?i)[a-zA-ZÀ-ÿ\s]+\s+(?:director|manager|head|chief|president)'
]
for pattern in title_patterns:
matches = re.findall(pattern, text)
if matches:
notes_parts.append(f"Fonction possible: {matches[0]}")
break
contact['notes'] = ' | '.join(notes_parts)
def _merge_contact_info(self, existing: Dict, new: Dict):
"""
        Merge the information of two contacts
"""
for key, value in new.items():
if value and not existing.get(key):
existing[key] = value
        # Merge the notes
if new.get('notes') and existing.get('notes'):
existing['notes'] = f"{existing['notes']} | {new['notes']}"
elif new.get('notes'):
existing['notes'] = new['notes']
def _extract_domain_info(self, url: str, results: Dict):
"""
        Extract general information about the domain
"""
domain = urlparse(url).netloc
results['domain_info'] = {
'domain': domain,
'company_guess': domain.split('.')[0].title(),
'total_contacts': len(results['contacts']),
'total_pages_scraped': len(results['pages_scraped'])
}
def _extract_emails_from_links(self, soup: BeautifulSoup) -> Set[str]:
"""
        Extract emails from mailto links
"""
emails = set()
        # Find mailto links
mailto_links = soup.find_all('a', href=re.compile(r'^mailto:', re.I))
for link in mailto_links:
href = link.get('href', '')
email_match = re.search(r'mailto:([^?&]+)', href, re.I)
if email_match:
email = email_match.group(1)
if self._is_valid_email(email):
emails.add(email.lower())
return emails
def _extract_emails_from_text(self, text: str) -> Set[str]:
"""
        Extract emails from the page text
"""
emails = set()
matches = self.email_pattern.findall(text)
for email in matches:
            # Filter out unwanted emails
if not self._is_valid_email(email):
continue
emails.add(email.lower())
return emails
def _extract_internal_links(self, soup: BeautifulSoup, base_url: str) -> List[str]:
"""
        Extract the internal links of the page
"""
links = []
base_domain = urlparse(base_url).netloc
for link in soup.find_all('a', href=True):
href = link['href']
full_url = urljoin(base_url, href)
parsed_link = urlparse(full_url)
            # Keep only internal links that have not been visited yet
if (parsed_link.netloc == base_domain and
full_url not in self.visited_urls and
not self._is_excluded_link(full_url)):
links.append(full_url)
return links
def _is_valid_email(self, email: str) -> bool:
"""
        Check whether the email is valid and not junk
"""
        # Filter out common file extensions
excluded_extensions = ['.jpg', '.png', '.gif', '.pdf', '.doc', '.css', '.js']
for ext in excluded_extensions:
if email.lower().endswith(ext):
return False
        # Filter out unwanted generic emails
excluded_patterns = [
'example.com',
'test.com',
'placeholder',
'your-email',
'youremail',
'email@',
'noreply',
'no-reply'
]
for pattern in excluded_patterns:
if pattern in email.lower():
return False
        # Check the length
if len(email) < 5 or len(email) > 254:
return False
return True
def _is_excluded_link(self, url: str) -> bool:
"""
        Check whether the link should be excluded from scraping
"""
excluded_patterns = [
'#',
'javascript:',
'tel:',
'mailto:',
'.pdf',
'.doc',
'.zip',
'.jpg',
'.png',
'.gif'
]
url_lower = url.lower()
for pattern in excluded_patterns:
if pattern in url_lower:
return True
return False
def save_results(self, results: Dict, filename: str = None) -> str:
"""
        Save the results to a JSON file
"""
if not filename:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
domain = urlparse(results['url']).netloc.replace('.', '_')
filename = f"scraping_{domain}_{timestamp}.json"
        # Create the folder if it does not exist
scraping_folder = 'Data/email_scraping'
os.makedirs(scraping_folder, exist_ok=True)
filepath = os.path.join(scraping_folder, filename)
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=2)
return filepath
class EmailScrapingHistory:
def __init__(self):
self.history_folder = 'Data/email_scraping'
os.makedirs(self.history_folder, exist_ok=True)
def get_all_scrapings(self) -> List[Dict]:
"""
        Fetch the history of all scrapes
"""
scrapings = []
for filename in os.listdir(self.history_folder):
if filename.endswith('.json'):
filepath = os.path.join(self.history_folder, filename)
try:
with open(filepath, 'r', encoding='utf-8') as f:
data = json.load(f)
scrapings.append({
'filename': filename,
'url': data.get('url', ''),
                        'emails_count': len(data.get('contacts', data.get('emails', []))),  # supports both the old and the new structure
'pages_count': len(data.get('pages_scraped', [])),
'start_time': data.get('start_time', ''),
'errors_count': len(data.get('errors', []))
})
except Exception as e:
print(f"Erreur lors de la lecture de {filename}: {e}")
        # Sort by date (newest first)
scrapings.sort(key=lambda x: x.get('start_time', ''), reverse=True)
return scrapings
def get_scraping_details(self, filename: str) -> Dict:
"""
        Fetch the details of a specific scrape
"""
filepath = os.path.join(self.history_folder, filename)
if os.path.exists(filepath):
with open(filepath, 'r', encoding='utf-8') as f:
return json.load(f)
return None
def delete_scraping(self, filename: str) -> bool:
"""
        Delete a scrape file
"""
filepath = os.path.join(self.history_folder, filename)
if os.path.exists(filepath):
try:
os.remove(filepath)
return True
except Exception as e:
print(f"Erreur lors de la suppression: {e}")
return False
return False
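
A minimal usage sketch (the URL is illustrative; scrape_page follows pagination up to max_pages, and save_results writes JSON under Data/email_scraping):

# Hypothetical usage: scrape a directory page and persist the results
scraper = EmailScraper()
results = scraper.scrape_page("https://example.com/annuaire", max_pages=3)
print(f"{len(results['contacts'])} contact(s), {len(results['errors'])} error(s)")
path = scraper.save_results(results)
print(f"Saved to {path}")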