# gemini-resume-checker/app.py

import os
import re
import asyncio
import secrets
import uuid
from flask import Flask, render_template, request, jsonify
from werkzeug.utils import secure_filename
from flask_wtf.csrf import CSRFProtect
from pdf2image import convert_from_path
from PIL import Image
import fitz  # PyMuPDF for PDF validation
from flask_cors import CORS
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
import google.generativeai as genai
import requests
from dotenv import load_dotenv
import markdown

# Load environment variables
load_dotenv()

# Configure Gemini API; the module refuses to import without an API key.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY environment variable is required but not set")
genai.configure(api_key=GOOGLE_API_KEY)
model_img = genai.GenerativeModel('gemini-2.0-flash-lite-preview')

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = os.getenv("UPLOAD_FOLDER", 'uploads/')
# A set rather than the bare string "pdf": `ext in "pdf"` is a substring test
# that would also accept "p", "d", "df" and the empty extension.
app.config['ALLOWED_EXTENSIONS'] = {"pdf"}
app.config['MAX_FILE_SIZE'] = int(os.getenv("MAX_FILE_SIZE", 4 * 1024 * 1024))  # 4MB limit
# Read SECRET_KEY directly. The previous nested os.getenv() call used the
# secret's *value* as a variable name, so the configured key was never applied
# (and an unset SECRET_KEY raised TypeError). Without a configured key a random
# one is generated for this process only.
app.config['SECRET_KEY'] = os.getenv("SECRET_KEY") or secrets.token_hex(32)

# Upper bound on the length of the sanitized job-description text.
MAX_TEXT_LENGTH = 2000

RECAPTCHA_SECRET_KEY = os.getenv("RECAPTCHA_SECRET_KEY", "")
RECAPTCHA_SITE_KEY = os.getenv("RECAPTCHA_SITE_KEY", "")

# NOTE(review): Flask is understood to run each async view on its own event
# loop; confirm that a module-level asyncio.Semaphore really limits work
# across concurrent requests.
semaphore = asyncio.Semaphore(5)  # Limit to 5 concurrent tasks

# Initialize Flask-Limiter
limiter = Limiter(
    get_remote_address,  # Limits requests based on client IP
    app=app,
    default_limits=[os.getenv("RATE_LIMIT", "1 per 10 seconds")])


# Answer rate-limited requests (HTTP 429) with an empty 444 response
@app.errorhandler(429)
def rate_limit_exceeded(e):
    """Replace the 429 error response with an empty body and status 444."""
    empty_body = ""
    no_response_status = 444  # Nginx-style "No Response" error
    return empty_body, no_response_status

# CSRF Protection
csrf = CSRFProtect(app)

# CORS with strict policy: only the origin named in CORS_ORIGIN is configured
CORS(app, resources={r"/*": {"origins": os.getenv("CORS_ORIGIN", "")}})

# Create the upload directory up front. exist_ok=True removes the race between
# a separate existence check and the creation when several workers start at
# once; it still raises if the path exists but is not a directory.
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

def allowed_file(filename):
    """Return True when *filename* ends in an allowed extension.

    The extension is the text after the last dot, compared case-insensitively
    against ``app.config['ALLOWED_EXTENSIONS']``. That setting may be a
    collection of extensions or a single string; a string is read as a
    comma-separated list.

    Args:
        filename: Name supplied by the client; may be empty or None.

    Returns:
        bool: False for a missing name, a name without a dot, an empty
        extension, or an extension that is not allowed.
    """
    if not filename or '.' not in filename:
        return False

    extension = filename.rsplit('.', 1)[1].lower()
    if not extension:
        return False

    allowed = app.config['ALLOWED_EXTENSIONS']
    if isinstance(allowed, str):
        # `extension in "pdf"` would be a substring test ("p", "df" and ""
        # all match), so turn the string into an exact-match set first.
        allowed = {part.strip().lower() for part in allowed.split(',')}
    return extension in allowed

def sanitize_text(text):
    """Drop disallowed characters from *text* and cap its length.

    Only ASCII letters, digits, spaces, ``. , ! ?`` and newline / carriage
    return characters are kept. The result is then truncated to
    ``MAX_TEXT_LENGTH`` characters.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9 .,!?\n\r]')
    kept_characters = disallowed.sub('', text)
    return kept_characters[:MAX_TEXT_LENGTH]

import fitz  # PyMuPDF

import fitz  # PyMuPDF

def is_valid_pdf(pdf_path):
    """Return True when the file at *pdf_path* passes the upload safety checks.

    The checks are, in order: the file starts with the ``%PDF`` magic bytes,
    no annotation reports a "JS" or "AA" entry in its ``info`` dict, and no
    link is a launch action or points into another file.

    Args:
        pdf_path: Path of the PDF file on disk.

    Returns:
        bool: True when every check passes. False when a check fails or any
        exception is raised while reading the file; the reason is printed.
    """
    try:
        # Check if the file starts with "%PDF" magic bytes
        with open(pdf_path, "rb") as f:
            if f.read(4) != b"%PDF":
                print("Error: File is not a valid PDF.")
                return False

        # Link kinds that are rejected. The earlier check listed only 5 and 6
        # although its comment promised to block launch actions; LINK_LAUNCH
        # is now included by name. 5 is PyMuPDF's LINK_GOTOR (jump into
        # another file). 6 is kept from the earlier check.
        # NOTE(review): confirm which action kind 6 was meant to cover.
        unsafe_link_kinds = {fitz.LINK_LAUNCH, fitz.LINK_GOTOR, 6}

        # The context manager closes the document on every exit path, so the
        # file handle is released before the caller deletes the upload.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                # Check for JavaScript in annotations
                # NOTE(review): PyMuPDF documents Annot.info as annotation
                # metadata (title, content, dates, ...); confirm that "JS" or
                # "AA" can actually appear there.
                for ann in page.annots() or []:
                    if ann.info.get("JS") or ann.info.get("AA"):  # JavaScript actions
                        print("Error: PDF contains JavaScript.")
                        return False

                # Form fields (page.widgets()) are deliberately not rejected.

                # External URI links (kind 2) are allowed; only the kinds
                # listed above are refused.
                for link in page.get_links():
                    if link.get("kind") in unsafe_link_kinds:
                        print("Error: PDF contains embedded files or launch actions.")
                        return False

        print("✅ PDF is valid and safe (contains no JavaScript or harmful elements).")
        return True

    except Exception as e:
        # Validation boundary: any failure to read or parse means "not valid".
        print(f"Error processing PDF: {e}")
        return False

def format_summary(summary):
    """Convert the Markdown text *summary* with ``markdown.markdown``."""
    # NOTE(review): the converted output is returned as-is; confirm the
    # client treats it as untrusted markup.
    rendered = markdown.markdown(summary)
    return rendered

def resize_image(image, max_size=1080):
    """Shrink *image* so that neither side exceeds *max_size* pixels.

    The aspect ratio is preserved. An image already within the limit is
    returned unchanged (the same object, not a copy).

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``resize(size, resample)`` method, such as a PIL image.
        max_size: Largest allowed width or height in pixels.

    Returns:
        The original image, or a resized one produced with LANCZOS resampling.
    """
    width, height = image.size
    longest_side = max(width, height)
    if longest_side <= max_size:
        return image

    scaling_factor = max_size / longest_side
    # Clamp each side to at least 1px: for a very elongated image the short
    # side would otherwise truncate to 0 and make the resize call fail.
    new_size = (
        max(1, int(width * scaling_factor)),
        max(1, int(height * scaling_factor)),
    )
    return image.resize(new_size, Image.Resampling.LANCZOS)

def stitch_images(image_paths):
    """Stitch the images at *image_paths* into a single vertical image.

    The images are stacked top to bottom in the given order on an RGB canvas
    as wide as the widest image. The result is passed through ``resize_image``
    with a 1080px limit.

    Args:
        image_paths: Iterable of paths to image files readable by PIL.

    Returns:
        The stitched (and possibly downscaled) image.

    Raises:
        ValueError: If *image_paths* is empty.
    """
    paths = list(image_paths)
    if not paths:
        raise ValueError("stitch_images() requires at least one image path")

    opened_images = []
    try:
        for img_path in paths:
            opened_images.append(Image.open(img_path))

        # Get total height & max width
        max_width = max(img.width for img in opened_images)
        total_height = sum(img.height for img in opened_images)

        # Create a blank image with the combined height
        stitched_image = Image.new("RGB", (max_width, total_height))

        # Paste images sequentially
        y_offset = 0
        for img in opened_images:
            stitched_image.paste(img, (0, y_offset))
            y_offset += img.height
    finally:
        # Release the file handles once the pixels are copied (or on error),
        # so callers can delete the source files afterwards.
        for img in opened_images:
            img.close()

    # Resize if needed
    return resize_image(stitched_image, max_size=1080)


def pdf_to_images(pdf_path, max_pages=2):
    """Convert the first pages of a PDF to one stitched image.

    Pages 1 to *max_pages* are converted with ``convert_from_path``, written
    as temporary PNG files next to the PDF, and combined by ``stitch_images``.
    The temporary PNGs are removed even when saving or stitching fails.

    Args:
        pdf_path: Path of the PDF file on disk.
        max_pages: Last page (1-based) to include.

    Returns:
        The stitched image returned by ``stitch_images``.
    """
    pages = convert_from_path(pdf_path, first_page=1, last_page=max_pages)

    image_paths = []
    try:
        for page_number, page_image in enumerate(pages, start=1):
            image_path = f"{pdf_path}_page_{page_number}.png"
            # Record the path before saving so that a partially written file
            # is cleaned up as well.
            image_paths.append(image_path)
            page_image.save(image_path, "PNG")

        # Stitch images into one
        return stitch_images(image_paths)
    finally:
        # Cleanup individual images on success and on failure alike
        for image_path in image_paths:
            if os.path.exists(image_path):
                os.remove(image_path)

async def summarize_pdf(pdf_file, jobdesc_text):
    """Ask the Gemini model how well the resume PDF matches a job description.

    The job description is sanitized and extended with a scoring question, the
    first two PDF pages are turned into one image, and both are sent to
    ``model_img``. The blocking steps run in worker threads. The PDF is
    deleted afterwards, whether or not the analysis succeeded.

    Args:
        pdf_file: Path of the uploaded PDF; removed before this returns.
        jobdesc_text: Job description text supplied by the user.

    Returns:
        The model's answer passed through ``format_summary``.
    """
    try:
        async with semaphore:  # Limit concurrency to 5
            jobdesc_text = sanitize_text(jobdesc_text)

            jobdesc_text += "\nHow compatible my CV is with this jobdesc? In a scale of 100 how would you score it?"

            # Convert PDF to image in a worker thread
            image = await asyncio.to_thread(pdf_to_images, pdf_file, 2)

            # Run AI inference in a separate thread to avoid blocking
            summary = await asyncio.to_thread(model_img.generate_content, [jobdesc_text, image])

            return format_summary(summary.text)
    finally:
        # Delete the upload on every path so a failed conversion or API call
        # does not leave the user's resume on disk.
        try:
            os.remove(pdf_file)
        except FileNotFoundError:
            pass

@app.after_request
def add_security_headers(response):
    """Set the security-related HTTP headers on *response* and return it."""
    # Content-Security-Policy directives, one per entry.
    csp_directives = (
        "default-src 'self'; ",
        "script-src 'self' https://www.google.com https://www.gstatic.com https://code.jquery.com https://cdn.jsdelivr.net https://www.recaptcha.net; ",
        "style-src 'self' https://cdn.jsdelivr.net; ",
        "img-src 'self' data: https://www.google.com https://www.gstatic.com; ",
        "frame-src 'self' https://www.google.com https://www.recaptcha.net; ",
        "connect-src 'self' https://www.google.com https://www.gstatic.com https://www.recaptcha.net; ",
        "object-src 'none'; ",
    )
    security_headers = {
        'X-Frame-Options': 'DENY',
        'X-Content-Type-Options': 'nosniff',
        'Referrer-Policy': 'strict-origin-when-cross-origin',
        'Content-Security-Policy': "".join(csp_directives),
    }
    for header_name, header_value in security_headers.items():
        response.headers[header_name] = header_value
    return response

@app.route('/', methods=['GET'])
def upload_page():
    """Render the upload form, passing the reCAPTCHA site key to the template."""
    template_context = {"recaptcha_site_key": RECAPTCHA_SITE_KEY}
    return render_template('upload.html', **template_context)

@app.route('/', methods=['POST'])
@limiter.limit("1 per 10 seconds")
async def process_upload():
    """Validate an uploaded resume and score it against a job description.

    Steps: verify the reCAPTCHA token, require both the job description and
    the PDF, check extension and size, save the upload under a random name,
    validate it with ``is_valid_pdf``, then call ``summarize_pdf``.

    Returns:
        A JSON response. On success it has ``summary`` (the formatted model
        answer) and ``score`` (the number found in front of "/100" or
        "out of 100", else None). On failure it has ``error`` with status 400
        for rejected input or 500 when the analysis itself fails.
    """
    recaptcha_response = request.form.get('g-recaptcha-response')
    try:
        recaptcha_verification = requests.post(
            "https://www.google.com/recaptcha/api/siteverify",
            data={"secret": RECAPTCHA_SECRET_KEY, "response": recaptcha_response},
            timeout=10,  # a stalled verification call must not hang the request
        ).json()
    except (requests.RequestException, ValueError):
        # Network failure or a non-JSON reply counts as a failed verification.
        recaptcha_verification = {}
    if not recaptcha_verification.get("success"):
        return jsonify({"error": "reCAPTCHA verification failed."}), 400

    text_input = sanitize_text(request.form.get('text_input', '').strip())
    file = request.files.get('file')

    # Both inputs are mandatory, as the error message states. The earlier
    # `and` only rejected requests that lacked both.
    if not text_input or not file:
        return jsonify({"error": "Job description and PDF file are required."}), 400

    if not allowed_file(file.filename):
        return jsonify({"error": "Invalid file format."}), 400

    # Measure the uploaded data itself. file.content_length reflects only a
    # Content-Length header on the multipart part, which clients usually
    # omit, so checking it alone lets oversized files through.
    file.stream.seek(0, os.SEEK_END)
    file_size = file.stream.tell()
    file.stream.seek(0)
    if file_size > app.config['MAX_FILE_SIZE']:
        return jsonify({"error": "File size exceeds 4MB limit."}), 400

    # Store under a random name; the client-supplied name is never used.
    filename = f"{uuid.uuid4().hex}.pdf"
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    file.save(filepath)

    try:
        if not is_valid_pdf(filepath):
            return jsonify({"error": "Invalid or potentially harmful PDF."}), 400
        summary = await summarize_pdf(filepath, text_input)
    except Exception:
        # Request boundary: log the failure and answer in JSON like every
        # other error path of this endpoint.
        app.logger.exception("Resume analysis failed")
        return jsonify({"error": "Failed to analyze the resume."}), 500
    finally:
        # Make sure the upload never outlives the request, whatever happened.
        if os.path.exists(filepath):
            os.remove(filepath)

    # The summary is HTML; drop tags first so markup such as
    # "<strong>85</strong>/100" does not hide the score from the pattern.
    plain_summary = re.sub(r"<[^>]+>", "", summary)
    score_match = re.search(r"(\d{1,3})\s*(?:\/|out of)\s*100", plain_summary, re.IGNORECASE)
    score = int(score_match.group(1)) if score_match else None
    return jsonify({"summary": summary, "score": score})

if __name__ == '__main__':
    # Host and port come from FLASK_RUN_HOST / FLASK_RUN_PORT, defaulting to
    # all interfaces on port 49465; debug mode stays off.
    run_host = os.getenv("FLASK_RUN_HOST", "0.0.0.0")
    run_port = int(os.getenv("FLASK_RUN_PORT", 49465))
    app.run(host=run_host, port=run_port, debug=False)