import asyncio
import os
import re
import secrets
import uuid

import fitz  # PyMuPDF for PDF validation
import google.generativeai as genai
import requests
from dotenv import load_dotenv
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
from flask_wtf.csrf import CSRFProtect
from pdf2image import convert_from_path
from PIL import Image
from werkzeug.utils import secure_filename

# Load environment variables
load_dotenv()

# Configure Gemini API
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY environment variable is required but not set")
genai.configure(api_key=GOOGLE_API_KEY)
model_img = genai.GenerativeModel('gemini-2.0-flash-lite-preview')

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = os.getenv("UPLOAD_FOLDER", 'uploads/')
app.config['ALLOWED_EXTENSIONS'] = "pdf"
app.config['MAX_FILE_SIZE'] = int(os.getenv("MAX_FILE_SIZE", 4 * 1024 * 1024))  # 4MB limit
# Read SECRET_KEY directly. The previous nested os.getenv() call used the
# *value* of SECRET_KEY as a variable name, which raised TypeError when the
# variable was unset. An unset or empty value falls back to a random key,
# which changes on every start.
app.config['SECRET_KEY'] = os.getenv("SECRET_KEY") or secrets.token_hex(32)

MAX_TEXT_LENGTH = 2000
RECAPTCHA_SECRET_KEY = os.getenv("RECAPTCHA_SECRET_KEY", "")
RECAPTCHA_SITE_KEY = os.getenv("RECAPTCHA_SITE_KEY", "")

semaphore = asyncio.Semaphore(5)  # Limit to 5 concurrent tasks

# Initialize Flask-Limiter
limiter = Limiter(
    get_remote_address,  # Limits requests based on client IP
    app=app,
    default_limits=[os.getenv("RATE_LIMIT", "1 per 10 seconds")],
)


# Custom function to reject requests with 444 status code
@app.errorhandler(429)
def rate_limit_exceeded(e):
    """Answer rate-limited requests with an empty body and status 444."""
    return "", 444  # Nginx-style "No Response" error


# CSRF Protection
csrf = CSRFProtect(app)

# CORS with strict policy
CORS(app, resources={r"/*": {"origins": os.getenv("CORS_ORIGIN", "")}})

# exist_ok avoids the check-then-create race of a separate os.path.exists().
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)


def allowed_file(filename):
    """Return True if *filename* has an extension listed in ALLOWED_EXTENSIONS.

    ALLOWED_EXTENSIONS may be a collection of extensions or a string holding
    one extension (or several, comma-separated). A string is split into
    whole extensions first; testing ``ext in "pdf"`` directly would be a
    substring match and accept names such as ``x.p``, ``x.df`` or ``x.``.
    """
    if '.' not in filename:
        return False
    allowed = app.config['ALLOWED_EXTENSIONS']
    if isinstance(allowed, str):
        allowed = {ext.strip().lower() for ext in allowed.split(',') if ext.strip()}
    return filename.rsplit('.', 1)[1].lower() in allowed


def sanitize_text(text):
    """Strip disallowed characters from *text* and cap its length.

    Only ASCII letters, digits, spaces, ``. , ! ?`` and CR/LF are kept; the
    result is truncated to MAX_TEXT_LENGTH characters.
    """
    sanitized_text = re.sub(r'[^a-zA-Z0-9 .,!?\n\r]', '', text)
    return sanitized_text[:MAX_TEXT_LENGTH]


def is_valid_pdf(pdf_path):
    """Return True if the file at *pdf_path* passes the PDF safety checks.

    Checks, in order: the ``%PDF`` magic bytes, annotation JavaScript
    markers, and link kinds that launch or reference other files. Any
    exception raised while inspecting the file is reported via ``print``
    and treated as "not valid" (returns False).
    """
    # URI links (fitz.LINK_URI) are allowed. Launch actions and links into
    # other documents are rejected. The value 6 is carried over from the
    # previous check for compatibility.
    # NOTE(review): 6 does not match a link kind I can confirm - verify.
    blocked_link_kinds = {fitz.LINK_LAUNCH, fitz.LINK_GOTOR, 6}
    try:
        # Check if the file starts with "%PDF" magic bytes
        with open(pdf_path, "rb") as f:
            if f.read(4) != b"%PDF":
                print("Error: File is not a valid PDF.")
                return False

        doc = fitz.open(pdf_path)
        try:
            for page in doc:
                # Check for JavaScript in annotations
                # NOTE(review): Annot.info does not appear to expose "JS" or
                # "AA" keys, so this test may never fire - confirm against
                # the PyMuPDF documentation.
                for ann in page.annots() or []:
                    if ann.info.get("JS") or ann.info.get("AA"):  # JavaScript actions
                        print("Error: PDF contains JavaScript.")
                        return False

                # Form fields are deliberately not rejected (a
                # page.widgets() check was disabled here).

                # Check for links that launch or reference other files
                for link in page.get_links():
                    if link.get("kind") in blocked_link_kinds:
                        print("Error: PDF contains embedded files or launch actions.")
                        return False
        finally:
            # Always release the document handle, including on early return.
            doc.close()

        print("✅ PDF is valid and safe (contains no JavaScript or harmful elements).")
        return True
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return False


def format_summary(summary):
    """Converts markdown-like text to HTML formatting, including nested lists (both ordered and unordered)."""
    summary = re.sub(r'\*\*(.*?)\*\*', r'\1', summary)  # Bold text
    summary = re.sub(r'\*(?!\s)(.*?)\*', r'\1', summary)  # Italic text (ignoring lists)
    lines = summary.split('\n')
    formatted_lines = []
    list_stack = []  # Track nesting levels and types
    for line in lines:
        unordered_match = re.match(r'^(\s*)\*\s(.+)', line)  # Matches "* item"
        ordered_match = re.match(r'^(\s*)(\d+)\.\s(.+)', line)  # Matches "1. 
item" if unordered_match or ordered_match: indent = unordered_match.group(1) if unordered_match else ordered_match.group(1) level = len(indent) // 4 # Assume 4 spaces per indent level list_type = '