From c867c9e730bfadca6f7eaa3b99efff4ae36a9c3b Mon Sep 17 00:00:00 2001 From: gabrielkheisa Date: Wed, 12 Feb 2025 01:05:18 +0700 Subject: [PATCH] first --- .gitignore | 50 ++++++++ Dockerfile | 26 ++++ app.py | 287 ++++++++++++++++++++++++++++++++++++++++++ docker-compose.yml | 26 ++++ env.example | 22 ++++ requirements.txt | 11 ++ static/css/style.css | 19 +++ static/js/main.js | 116 +++++++++++++++++ templates/upload.html | 47 +++++++ 9 files changed, 604 insertions(+) create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 app.py create mode 100644 docker-compose.yml create mode 100644 env.example create mode 100644 requirements.txt create mode 100644 static/css/style.css create mode 100644 static/js/main.js create mode 100644 templates/upload.html diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c8fdc92 --- /dev/null +++ b/.gitignore @@ -0,0 +1,50 @@ +app-dev.py +check_pip_modules_version.py + +# Byte-compiled / cached files +__pycache__/ +*.py[cod] +*$py.class + +# Virtual environment +venv/ +env/ +*.egg-info/ +pip-log.txt +pip-delete-this-directory.txt + +# Distribution / packaging +build/ +dist/ +*.egg +*.egg-info/ +.eggs/ + +# Logs and databases +*.log +*.sqlite3 +*.db + +# Jupyter Notebook checkpoints +.ipynb_checkpoints/ + +# Pytest and coverage reports +.coverage +.tox/ +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# MyPy and type checking +.mypy_cache/ + +# IDE files +.vscode/ +.idea/ +*.iml + +# Docker and deployment +*.dockerfile +.env + diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..d4cfefb --- /dev/null +++ b/Dockerfile @@ -0,0 +1,26 @@ +FROM python:3.9-slim + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements first to leverage Docker cache +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the rest of the application +COPY . . + +# Set environment variables +ENV FLASK_APP=app.py +ENV FLASK_ENV=development +ENV PYTHONUNBUFFERED=1 + +# Expose the port the app runs on +EXPOSE 5000 + +# Command to run the application +CMD ["flask", "run", "--host=0.0.0.0"] \ No newline at end of file diff --git a/app.py b/app.py new file mode 100644 index 0000000..c00af11 --- /dev/null +++ b/app.py @@ -0,0 +1,287 @@ +import os +import re +import asyncio +import secrets +import uuid +from flask import Flask, render_template, request, jsonify +from werkzeug.utils import secure_filename +from flask_wtf.csrf import CSRFProtect +from pdf2image import convert_from_path +from PIL import Image +import fitz # PyMuPDF for PDF validation +from flask_cors import CORS +from flask_limiter import Limiter +from flask_limiter.util import get_remote_address +import google.generativeai as genai +import requests +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Configure Gemini API +GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "") +if not GOOGLE_API_KEY: + raise ValueError("GOOGLE_API_KEY environment variable is required but not set") +genai.configure(api_key=GOOGLE_API_KEY) +model_img = genai.GenerativeModel('gemini-2.0-flash-lite-preview') + +app = Flask(__name__) +app.config['UPLOAD_FOLDER'] = os.getenv("UPLOAD_FOLDER", 'uploads/') +app.config['ALLOWED_EXTENSIONS'] = "pdf" +app.config['MAX_FILE_SIZE'] = int(os.getenv("MAX_FILE_SIZE", 4 * 1024 * 1024)) # 4MB limit +app.config['SECRET_KEY'] = os.getenv(os.getenv("SECRET_KEY"), secrets.token_hex(32)) + +MAX_TEXT_LENGTH = 2000 + +RECAPTCHA_SECRET_KEY = os.getenv("RECAPTCHA_SECRET_KEY", "") +RECAPTCHA_SITE_KEY = os.getenv("RECAPTCHA_SITE_KEY", "") + +semaphore = asyncio.Semaphore(5) # Limit to 5 concurrent tasks + +# Initialize Flask-Limiter +limiter = Limiter( + get_remote_address, # Limits requests based on client IP + app=app, + default_limits=[os.getenv("RATE_LIMIT", "1 per 10 seconds")]) + + +# Custom function to reject requests with 444 status code +@app.errorhandler(429) +def rate_limit_exceeded(e): + return "", 444 # Nginx-style "No Response" error + +# CSRF Protection +csrf = CSRFProtect(app) + +# CORS with strict policy +CORS(app, resources={r"/*": {"origins": os.getenv("CORS_ORIGIN", "")}}) + +if not os.path.exists(app.config['UPLOAD_FOLDER']): + os.makedirs(app.config['UPLOAD_FOLDER']) + +def allowed_file(filename): + return '.' in filename and filename.rsplit('.', 1)[1].lower() in app.config['ALLOWED_EXTENSIONS'] + +def sanitize_text(text): + sanitized_text = re.sub(r'[^a-zA-Z0-9 .,!?\n\r]', '', text) + return sanitized_text[:MAX_TEXT_LENGTH] + +import fitz # PyMuPDF + +import fitz # PyMuPDF + +def is_valid_pdf(pdf_path): + try: + # Check if the file starts with "%PDF" magic bytes + with open(pdf_path, "rb") as f: + if not f.read(4) == b"%PDF": + print("Error: File is not a valid PDF.") + return False + + doc = fitz.open(pdf_path) + + for page in doc: + # Check for JavaScript in annotations + for ann in page.annots() or []: + if ann.info.get("JS") or ann.info.get("AA"): # JavaScript actions + print("Error: PDF contains JavaScript.") + return False + + # Check for form fields (can contain scripts) + ''' + if page.widgets(): + print("Error: PDF contains form fields (potentially unsafe).") + return False + ''' + + # Check for embedded files (can contain malicious content) + for link in page.get_links(): + if link.get("kind") == 2: # Allow external links + continue + elif link.get("kind") in [5, 6]: # Embedded file or launch action + print("Error: PDF contains embedded files or launch actions.") + return False + + print("✅ PDF is valid and safe (contains no JavaScript or harmful elements).") + return True + + except Exception as e: + print(f"Error processing PDF: {e}") + return False + +def format_summary(summary): + """Converts markdown-like text to HTML formatting, including nested lists (both ordered and unordered).""" + summary = re.sub(r'\*\*(.*?)\*\*', r'\1', summary) # Bold text + summary = re.sub(r'\*(?!\s)(.*?)\*', r'\1', summary) # Italic text (ignoring lists) + + lines = summary.split('\n') + formatted_lines = [] + list_stack = [] # Track nesting levels and types + + for line in lines: + unordered_match = re.match(r'^(\s*)\*\s(.+)', line) # Matches "* item" + ordered_match = re.match(r'^(\s*)(\d+)\.\s(.+)', line) # Matches "1. item" + + if unordered_match or ordered_match: + indent = unordered_match.group(1) if unordered_match else ordered_match.group(1) + level = len(indent) // 4 # Assume 4 spaces per indent level + + list_type = '