This commit is contained in:
Gabriel Kheisa 2025-02-12 01:05:18 +07:00
commit c867c9e730
9 changed files with 604 additions and 0 deletions

50
.gitignore vendored Normal file
View File

@ -0,0 +1,50 @@
app-dev.py
check_pip_modules_version.py
# Byte-compiled / cached files
__pycache__/
*.py[cod]
*$py.class
# Virtual environment
venv/
env/
*.egg-info/
pip-log.txt
pip-delete-this-directory.txt
# Distribution / packaging
build/
dist/
*.egg
*.egg-info/
.eggs/
# Logs and databases
*.log
*.sqlite3
*.db
# Jupyter Notebook checkpoints
.ipynb_checkpoints/
# Pytest and coverage reports
.coverage
.tox/
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# MyPy and type checking
.mypy_cache/
# IDE files
.vscode/
.idea/
*.iml
# Docker and deployment
*.dockerfile
.env

26
Dockerfile Normal file
View File

@ -0,0 +1,26 @@
FROM python:3.9-slim
WORKDIR /app
# Install system dependencies.
# poppler-utils ships the pdftoppm/pdfinfo binaries that pdf2image
# (used by app.py's convert_from_path call) invokes at runtime.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*
# Copy requirements first to leverage Docker cache
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy the rest of the application
COPY . .
# Set environment variables
# (FLASK_ENV was dropped here: the Flask release pinned in requirements.txt
# no longer reads it, so it only suggested a debug mode that was not active.)
ENV FLASK_APP=app.py
ENV PYTHONUNBUFFERED=1
# Expose the port the app runs on
EXPOSE 5000
# Command to run the application
CMD ["flask", "run", "--host=0.0.0.0"]

287
app.py Normal file
View File

@ -0,0 +1,287 @@
import os
import re
import asyncio
import secrets
import uuid
from flask import Flask, render_template, request, jsonify
from werkzeug.utils import secure_filename
from flask_wtf.csrf import CSRFProtect
from pdf2image import convert_from_path
from PIL import Image
import fitz # PyMuPDF for PDF validation
from flask_cors import CORS
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
import google.generativeai as genai
import requests
from dotenv import load_dotenv
# Load environment variables (from a local .env file when one exists)
load_dotenv()

# Configure Gemini API
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY environment variable is required but not set")
genai.configure(api_key=GOOGLE_API_KEY)
model_img = genai.GenerativeModel('gemini-2.0-flash-lite-preview')

app = Flask(__name__)

# NOTE: settings use `os.getenv(...) or default` rather than a getenv default so
# that variables present but empty (e.g. "SECRET_KEY=" copied from env.example)
# fall back to the default instead of yielding "" / crashing int("").
app.config['UPLOAD_FOLDER'] = os.getenv("UPLOAD_FOLDER") or 'uploads/'
# A set, not the string "pdf": `ext in "pdf"` is a substring test and would
# accept extensions such as "p", "df" or the empty string.
app.config['ALLOWED_EXTENSIONS'] = {"pdf"}
app.config['MAX_FILE_SIZE'] = int(os.getenv("MAX_FILE_SIZE") or 4 * 1024 * 1024)  # 4MB limit
# Previously os.getenv(os.getenv("SECRET_KEY"), ...) used the *value* of
# SECRET_KEY as a variable name and raised TypeError when it was unset.
# The random fallback changes on every start, so sessions/CSRF tokens do not
# survive restarts; set SECRET_KEY explicitly in production.
app.config['SECRET_KEY'] = os.getenv("SECRET_KEY") or secrets.token_hex(32)

MAX_TEXT_LENGTH = 2000
RECAPTCHA_SECRET_KEY = os.getenv("RECAPTCHA_SECRET_KEY", "")
RECAPTCHA_SITE_KEY = os.getenv("RECAPTCHA_SITE_KEY", "")

# Limit to 5 concurrent tasks.
# NOTE(review): Flask documents that each async view runs in its own event
# loop; an asyncio.Semaphore shared across loops may not throttle as intended
# (and can error under contention) - confirm, consider threading.Semaphore.
semaphore = asyncio.Semaphore(5)

# Initialize Flask-Limiter
limiter = Limiter(
    get_remote_address,  # Limits requests based on client IP
    app=app,
    default_limits=[os.getenv("RATE_LIMIT") or "1 per 10 seconds"],
)
# Rate-limited requests are rejected with a bare 444 instead of the default 429 page
@app.errorhandler(429)
def rate_limit_exceeded(e):
    """Answer a 429 (rate limit exceeded) error with an empty body and status 444."""
    empty_body = ""
    no_response_status = 444  # Nginx-style "No Response" error
    return empty_body, no_response_status
# CSRF Protection
csrf = CSRFProtect(app)

# CORS: only the origin configured through CORS_ORIGIN is allowed.
# NOTE(review): how flask-cors treats an empty origin string is not visible
# here - confirm it really denies cross-origin requests.
CORS(app, resources={r"/*": {"origins": os.getenv("CORS_ORIGIN", "")}})

# exist_ok=True replaces the exists()/makedirs() pair, which could race when
# several workers start at the same time.
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
def allowed_file(filename):
    """Return True when *filename* ends in one of the configured extensions.

    ``app.config['ALLOWED_EXTENSIONS']`` may be a collection of extensions or
    a single comma-separated string such as ``"pdf"``; both are compared by
    exact match.

    Args:
        filename: Client-supplied file name; may be empty or None.

    Returns:
        bool: True if the text after the last dot is an allowed extension.
    """
    if not filename or '.' not in filename:
        return False

    allowed = app.config['ALLOWED_EXTENSIONS']
    if isinstance(allowed, str):
        # `ext in "pdf"` would be a substring test: "p", "df" and even an empty
        # extension ("cv.") would pass. Normalise the string to a set first.
        allowed = {
            item.strip().lower().lstrip('.')
            for item in allowed.split(',')
            if item.strip()
        }

    extension = filename.rsplit('.', 1)[1].lower()
    return extension in allowed
def sanitize_text(text):
    """Strip disallowed characters from *text* and cap its length.

    Only ASCII letters, digits, spaces, ``. , ! ?`` and line breaks are kept;
    the result is truncated to ``MAX_TEXT_LENGTH`` characters.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9 .,!?\n\r]')
    cleaned = disallowed.sub('', text)
    return cleaned[:MAX_TEXT_LENGTH]
import fitz # PyMuPDF
import fitz # PyMuPDF
def is_valid_pdf(pdf_path):
    """Check that the file at *pdf_path* is a PDF without risky link actions.

    The file must start with the ``%PDF`` magic bytes and open in PyMuPDF.
    It is rejected when an annotation reports a JavaScript entry or when a
    page link launches an external application or jumps to another file.

    Args:
        pdf_path: Path of the uploaded file on disk.

    Returns:
        bool: True if every check passes; False otherwise, including when the
        file cannot be read or parsed (the reason is printed).
    """
    try:
        # Check if the file starts with "%PDF" magic bytes
        with open(pdf_path, "rb") as f:
            if f.read(4) != b"%PDF":
                print("Error: File is not a valid PDF.")
                return False

        doc = fitz.open(pdf_path)
        try:
            # Plain URI links stay allowed. The previous check tested kinds 5
            # and 6, which never matched launch actions (LINK_LAUNCH is 3).
            blocked_link_kinds = {fitz.LINK_LAUNCH, fitz.LINK_GOTOR}
            for page in doc:
                # Check for JavaScript in annotations.
                # NOTE(review): confirm that Annot.info actually exposes
                # "JS"/"AA" keys; if not, this check never fires.
                for ann in page.annots() or []:
                    if ann.info.get("JS") or ann.info.get("AA"):  # JavaScript actions
                        print("Error: PDF contains JavaScript.")
                        return False
                # A form-field check (page.widgets()) was previously disabled
                # here, and remains disabled.
                for link in page.get_links():
                    if link.get("kind") in blocked_link_kinds:
                        print("Error: PDF contains embedded files or launch actions.")
                        return False
        finally:
            # Always release the document handle, including on early returns.
            doc.close()

        print("✅ PDF is valid and safe (contains no JavaScript or harmful elements).")
        return True
    except Exception as e:
        # Any parsing failure is treated as "not a safe PDF".
        print(f"Error processing PDF: {e}")
        return False
def format_summary(summary):
    """Convert markdown-like text to HTML, including nested lists.

    Supported syntax: ``**bold**``, ``*italic*``, unordered items (``* item``)
    and ordered items (``1. item``). Four spaces of indentation open one
    nested list level. All other lines are passed through unchanged.

    The input is HTML-escaped first because the result is inserted into the
    page as markup by the front end.

    Args:
        summary: Raw text returned by the model.

    Returns:
        str: HTML fragment with one element or text line per output line.
    """
    import html

    # Escape before adding our own tags so model output cannot inject markup.
    summary = html.escape(summary, quote=False)
    summary = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', summary)  # Bold text
    summary = re.sub(r'\*(?!\s)(.*?)\*', r'<em>\1</em>', summary)  # Italic text (ignoring lists)

    formatted_lines = []
    list_stack = []  # Tag names ('ul' / 'ol') of the currently open lists

    def close_lists(target_depth):
        """Emit closing tags until at most *target_depth* lists remain open."""
        while len(list_stack) > target_depth:
            formatted_lines.append('</{}>'.format(list_stack.pop()))

    for line in summary.split('\n'):
        unordered_match = re.match(r'^(\s*)\*\s(.+)', line)  # Matches "* item"
        ordered_match = re.match(r'^(\s*)(\d+)\.\s(.+)', line)  # Matches "1. item"

        if not (unordered_match or ordered_match):
            # Close any open lists before adding non-list content
            close_lists(0)
            formatted_lines.append(line)
            continue

        if unordered_match:
            indent, item, tag = unordered_match.group(1), unordered_match.group(2), 'ul'
        else:
            indent, item, tag = ordered_match.group(1), ordered_match.group(3), 'ol'

        # Depth 1 is a top-level list. The previous code used depth 0 there,
        # so top-level items were emitted without any enclosing <ul>/<ol>.
        depth = len(indent) // 4 + 1  # Assume 4 spaces per indent level

        close_lists(depth)
        while len(list_stack) < depth:
            formatted_lines.append('<{}>'.format(tag))
            list_stack.append(tag)

        # Handle list type switching (unordered <-> ordered) at the same level
        if list_stack[-1] != tag:
            formatted_lines.append('</{}>'.format(list_stack.pop()))
            formatted_lines.append('<{}>'.format(tag))
            list_stack.append(tag)

        formatted_lines.append('<li>{}</li>'.format(item))

    # Close any remaining lists
    close_lists(0)
    return '\n'.join(formatted_lines)
def resize_image(image, max_size=1080):
    """Shrink *image* so that neither side exceeds *max_size* pixels.

    The aspect ratio is kept. An image that already fits is returned as-is;
    otherwise a resized copy (LANCZOS resampling) is returned.
    """
    longest_side = max(image.size)
    if longest_side <= max_size:
        return image

    ratio = max_size / longest_side
    target_size = tuple(int(side * ratio) for side in image.size)
    return image.resize(target_size, Image.Resampling.LANCZOS)
def stitch_images(image_paths):
    """Stitch multiple images into a single vertical image.

    Images are pasted top to bottom, left-aligned, on a canvas as wide as the
    widest image. The result is passed through ``resize_image`` (max side
    1080 px).

    Args:
        image_paths: Paths of the images, in top-to-bottom order.

    Returns:
        The stitched (and possibly downscaled) RGB image.

    Raises:
        ValueError: If *image_paths* is empty.
    """
    from contextlib import ExitStack

    if not image_paths:
        raise ValueError("stitch_images() requires at least one image path")

    # ExitStack closes every source file once its pixels have been copied;
    # previously the opened images were never closed.
    with ExitStack() as stack:
        images = [stack.enter_context(Image.open(img_path)) for img_path in image_paths]

        # Get total height & max width
        max_width = max(img.width for img in images)
        total_height = sum(img.height for img in images)

        # Create a blank image with the combined height
        stitched_image = Image.new("RGB", (max_width, total_height))

        # Paste images sequentially
        y_offset = 0
        for img in images:
            stitched_image.paste(img, (0, y_offset))
            y_offset += img.height

    # Resize if needed
    return resize_image(stitched_image, max_size=1080)
def pdf_to_images(pdf_path, max_pages=2):
    """Render the first pages of a PDF and stitch them into one image.

    Each page is written as ``<pdf_path>_page_<n>.png``, the files are
    combined with ``stitch_images`` and then deleted, whether or not the
    stitching succeeded.

    Args:
        pdf_path: Path of the PDF on disk.
        max_pages: Number of leading pages to render.

    Returns:
        The stitched image produced by ``stitch_images``.
    """
    images = convert_from_path(pdf_path, first_page=1, last_page=max_pages)
    image_paths = []
    try:
        for page_number, image in enumerate(images, start=1):
            image_path = f"{pdf_path}_page_{page_number}.png"
            # Register the path before saving so a partially written file is
            # cleaned up as well.
            image_paths.append(image_path)
            image.save(image_path, "PNG")

        # Stitch images into one
        return stitch_images(image_paths)
    finally:
        # Cleanup individual images, even when saving or stitching failed
        for img_path in image_paths:
            try:
                os.remove(img_path)
            except FileNotFoundError:
                pass
async def summarize_pdf(pdf_file, jobdesc_text):
    """Ask the model how well the CV in *pdf_file* matches a job description.

    The PDF is rendered to a single image in a worker thread, sent to the
    model together with the sanitized job description, and the answer is
    converted to HTML by ``format_summary``. The PDF is deleted afterwards,
    also when rendering or the model call fails.

    Args:
        pdf_file: Path of the uploaded PDF; removed by this function.
        jobdesc_text: Job description entered by the user.

    Returns:
        str: The model's answer formatted as an HTML fragment.
    """
    try:
        async with semaphore:  # Limit concurrency to 5
            prompt = sanitize_text(jobdesc_text)
            # Appended after sanitizing so the question text is kept intact.
            prompt += "\nHow compatible my CV is with this jobdesc? In a scale of 100 how would you score it?"
            # Convert PDF to image in a worker thread
            image = await asyncio.to_thread(pdf_to_images, pdf_file, 2)
            # Run AI inference in a separate thread to avoid blocking
            summary = await asyncio.to_thread(model_img.generate_content, [prompt, image])
    finally:
        # Delete the PDF after processing; previously it was left on disk
        # whenever one of the steps above raised.
        try:
            os.remove(pdf_file)
        except FileNotFoundError:
            pass
    return format_summary(summary.text)
@app.after_request
def add_security_headers(response):
    """Attach framing, MIME-sniffing, referrer and CSP headers to every response."""
    csp_directives = (
        "default-src 'self'",
        "script-src 'self' https://www.google.com https://www.gstatic.com https://code.jquery.com https://cdn.jsdelivr.net https://www.recaptcha.net",
        "style-src 'self' https://cdn.jsdelivr.net",
        "img-src 'self' data: https://www.google.com https://www.gstatic.com",
        "frame-src 'self' https://www.google.com https://www.recaptcha.net",
        "connect-src 'self' https://www.google.com https://www.gstatic.com https://www.recaptcha.net",
        "object-src 'none'",
    )
    security_headers = {
        'X-Frame-Options': 'DENY',
        'X-Content-Type-Options': 'nosniff',
        'Referrer-Policy': 'strict-origin-when-cross-origin',
        # Every directive, including the last one, is terminated by "; ".
        'Content-Security-Policy': "; ".join(csp_directives) + "; ",
    }
    for header_name, header_value in security_headers.items():
        response.headers[header_name] = header_value
    return response
@app.route('/', methods=['GET'])
def upload_page():
    """Render the upload form, passing the reCAPTCHA site key to the template."""
    template_context = {"recaptcha_site_key": RECAPTCHA_SITE_KEY}
    return render_template('upload.html', **template_context)
@app.route('/', methods=['POST'])
@limiter.limit("1 per 10 seconds")
async def process_upload():
    """Validate an uploaded CV and return the model's compatibility summary.

    Steps: verify the reCAPTCHA token, require both a job description and a
    PDF, enforce the size limit, store the file under a random name, run the
    PDF safety checks and finally ask the model for a summary.

    Returns:
        A JSON response ``{"summary": str, "score": int | None}`` on success,
        or ``{"error": str}`` with status 400/500 on failure.
    """
    recaptcha_response = request.form.get('g-recaptcha-response')
    try:
        recaptcha_verification = requests.post(
            "https://www.google.com/recaptcha/api/siteverify",
            data={"secret": RECAPTCHA_SECRET_KEY, "response": recaptcha_response},
            timeout=10,  # never let a stalled verification call hang the request
        ).json()
    except (requests.RequestException, ValueError):
        # Network failure or a non-JSON answer counts as a failed verification.
        return jsonify({"error": "reCAPTCHA verification failed."}), 400
    if not recaptcha_verification.get("success"):
        return jsonify({"error": "reCAPTCHA verification failed."}), 400

    text_input = sanitize_text(request.form.get('text_input', '').strip())
    file = request.files.get('file')
    # Both inputs are needed (the message says so); the old `and` only
    # rejected requests in which *both* were missing.
    if not text_input or not file:
        return jsonify({"error": "Job description and PDF file are required."}), 400
    if not allowed_file(file.filename):
        return jsonify({"error": "Invalid file format."}), 400

    # content_length is normally unset for multipart parts, so measure the
    # received stream itself to enforce the limit.
    file.stream.seek(0, os.SEEK_END)
    file_size = file.stream.tell()
    file.stream.seek(0)
    if file_size > app.config['MAX_FILE_SIZE']:
        return jsonify({"error": "File size exceeds 4MB limit."}), 400

    # A random name avoids trusting the client-supplied file name.
    filename = f"{uuid.uuid4().hex}.pdf"
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    file.save(filepath)
    if not is_valid_pdf(filepath):
        os.remove(filepath)
        return jsonify({"error": "Invalid or potentially harmful PDF."}), 400

    try:
        summary = await summarize_pdf(filepath, text_input)
    except Exception:
        # Request boundary: log the details, answer with a generic JSON error
        # and make sure the upload does not stay on disk.
        app.logger.exception("CV analysis failed")
        if os.path.exists(filepath):
            os.remove(filepath)
        return jsonify({"error": "Could not analyse the CV. Please try again later."}), 500

    # Search the text without tags: a score such as "<strong>85</strong>/100"
    # would otherwise not match.
    plain_summary = re.sub(r'<[^>]+>', '', summary)
    score_match = re.search(r"(\d{1,3})\s*(?:\/|out of)\s*100", plain_summary, re.IGNORECASE)
    # Clamp to 100 because the pattern accepts any three-digit number.
    score = min(int(score_match.group(1)), 100) if score_match else None
    return jsonify({"summary": summary, "score": score})
if __name__ == '__main__':
    # `or` fallbacks: env.example ships FLASK_RUN_HOST/FLASK_RUN_PORT as empty
    # values, and int("") would raise ValueError at startup.
    app.run(
        host=os.getenv("FLASK_RUN_HOST") or "0.0.0.0",
        port=int(os.getenv("FLASK_RUN_PORT") or 49465),
        debug=False,
    )

26
docker-compose.yml Normal file
View File

@ -0,0 +1,26 @@
# Compose definition for the CV reader app.
# (This file previously contained a copy of the Dockerfile, which is not
# a valid Compose document.)
services:
  web:
    # Build the image from the Dockerfile in this directory
    build: .
    # The Dockerfile exposes 5000 and starts "flask run --host=0.0.0.0"
    ports:
      - "5000:5000"
    # Runtime settings (GOOGLE_API_KEY, reCAPTCHA keys, ...); copy
    # env.example to .env and fill it in before starting
    env_file:
      - .env
    restart: unless-stopped

22
env.example Normal file
View File

@ -0,0 +1,22 @@
# Flask Configuration
FLASK_RUN_HOST=
FLASK_RUN_PORT=
SECRET_KEY=
# Upload Settings
UPLOAD_FOLDER=uploads/
MAX_FILE_SIZE=4200000
# Google AI API
GOOGLE_API_KEY=
# reCAPTCHA Keys
RECAPTCHA_SECRET_KEY=a
RECAPTCHA_SITE_KEY=_
# Rate Limiting
RATE_LIMIT=1 per 10 seconds
# CORS Configuration (Set to your domain or leave empty for strict security)
CORS_ORIGIN=

11
requirements.txt Normal file
View File

@ -0,0 +1,11 @@
# The "async" extra is required because app.py defines an `async def` view.
flask[async]==3.1.0
werkzeug==3.1.3
pdf2image==1.17.0
Pillow==11.0.0
PyMuPDF==1.25.3
flask-cors==5.0.0
flask-wtf==1.2.2
flask-limiter==3.10.1
google-generativeai==0.8.3
requests==2.31.0
python-dotenv==1.0.1

19
static/css/style.css Normal file
View File

@ -0,0 +1,19 @@
/* Progress indicator: hidden by default; main.js shows it while an upload is in flight. */
.loading-spinner {
display: none;
text-align: center;
margin-top: 20px;
}
/* Row holding the score chart and the score sentence side by side. */
.score-container {
display: flex;
align-items: center;
gap: 10px;
margin-top: 20px;
}
/* Fixed 80x80 chart; !important presumably overrides sizes set on the canvas by the chart library (confirm). */
.score-container canvas {
width: 80px !important;
height: 80px !important;
}
/* Emphasised "Your CV compatibility score is N/100" text. */
.score-text {
font-size: 1.2rem;
font-weight: bold;
}

116
static/js/main.js Normal file
View File

@ -0,0 +1,116 @@
// Submit the upload form via AJAX and render the score chart and summary
// returned by the server.
$(document).ready(function () {
    $('#upload-form').submit(function (event) {
        event.preventDefault();
        $('#error-message').text('');

        // Validate the captcha first so the spinner is only shown for real requests.
        if (!grecaptcha.getResponse()) {
            alert("Please complete the reCAPTCHA verification.");
            return;
        }
        $('.loading-spinner').show();

        var csrfToken = $('#csrf_token').val();
        // FormData(this) already carries the file, the job description and
        // the g-recaptcha-response field rendered by the widget.
        var formData = new FormData(this);
        formData.append('csrf_token', csrfToken);

        $.ajax({
            url: '/',
            type: 'POST',
            data: formData,
            contentType: false,
            processData: false,
            beforeSend: function (xhr) {
                xhr.setRequestHeader('X-CSRFToken', csrfToken);
            },
            success: function (response) {
                if (response.error) {
                    $('#error-message').text(response.error);
                    return;
                }

                let scoreSectionHtml = '';
                let resultSectionHtml = '';
                const hasScore = response.score !== null && response.score !== undefined;
                if (hasScore) {
                    scoreSectionHtml = `
                        <div class="score-container" id="score-section">
                            <canvas id="scoreChart"></canvas>
                            <div>
                                <p class="score-text">Your CV compatibility score is <span id="score-text">${response.score}</span>/100</p>
                            </div>
                        </div>`;
                }
                if (response.summary) {
                    // The summary is an HTML fragment produced by the server.
                    resultSectionHtml = `
                        <div class="mt-4" id="result-section">
                            <h4>Summary:</h4>
                            <div id="summary-text">${response.summary}</div>
                            <button class="btn btn-secondary" id="clear-form">Upload Another</button>
                        </div>`;
                }

                // Remove previous sections before inserting new ones
                $('#score-section, #result-section').remove();

                // Insert the score first and the summary below it. When no
                // score was found the summary goes directly after the form;
                // previously it was anchored to the missing #score-section
                // and never displayed.
                let $anchor = $('#upload-form');
                if (scoreSectionHtml) {
                    $anchor.after(scoreSectionHtml);
                    updateScoreChart(response.score);
                    $anchor = $('#score-section');
                }
                if (resultSectionHtml) {
                    $anchor.after(resultSectionHtml);
                }

                // Attach event listener to new "Upload Another" button
                $('#clear-form').click(function () {
                    $('#score-section, #result-section').remove();
                    $('#upload-form')[0].reset();
                    grecaptcha.reset(); // Reset reCAPTCHA
                });
            },
            error: function (xhr) {
                let errorMessage = 'An error occurred. Please try again.';
                if (xhr.responseJSON && xhr.responseJSON.error) {
                    errorMessage = xhr.responseJSON.error;
                }
                $('#error-message').text(errorMessage);
            },
            complete: function () {
                $('.loading-spinner').hide();
                // A reCAPTCHA token can be verified only once, so a retry
                // after any response needs a fresh challenge.
                grecaptcha.reset();
            }
        });
    });
});
// Draw (or redraw) the doughnut chart visualising a 0-100 compatibility score.
function updateScoreChart(score) {
    const context = document.getElementById('scoreChart').getContext('2d');

    // Only one chart instance may own the canvas; drop the previous one.
    if (window.scoreChartInstance) {
        window.scoreChartInstance.destroy();
    }

    // Set color based on score
    let scoreColor;
    if (score < 50) {
        scoreColor = "#f44336"; // Red (below 50)
    } else if (score < 75) {
        scoreColor = "#ffeb3b"; // Yellow (below 75)
    } else {
        scoreColor = "#4caf50"; // Green (default)
    }

    const chartData = {
        labels: ["Score", "Remaining"],
        datasets: [{
            data: [score, 100 - score],
            backgroundColor: [scoreColor, "#e0e0e0"],
            borderWidth: 1
        }]
    };
    const chartOptions = {
        responsive: false,
        cutout: '60%',
        plugins: {
            legend: { display: false }
        }
    };

    window.scoreChartInstance = new Chart(context, {
        type: 'doughnut',
        data: chartData,
        options: chartOptions
    });
}

47
templates/upload.html Normal file
View File

@ -0,0 +1,47 @@
<!DOCTYPE html>
<!-- Upload page: a CV (PDF) plus job description form; static/js/main.js submits it via AJAX and renders the result. -->
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>CV Reader & Job Match</title>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css">
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script> <!-- Chart.js for the doughnut score chart drawn by main.js -->
<script src="https://www.google.com/recaptcha/api.js" async defer></script>
<link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
</head>
<body class="bg-light">
<div class="container py-5">
<h2 class="text-center mb-4">CV Reader & Job Description Compatibility</h2>
<div class="card shadow p-4">
<form id="upload-form" method="post" enctype="multipart/form-data">
<!-- The token has no name attribute; main.js reads it by id and sends it as the csrf_token field and the X-CSRFToken header. -->
<input type="hidden" id="csrf_token" value="{{ csrf_token() }}">
<div class="mb-3">
<label for="file" class="form-label">Upload Your CV (PDF)</label>
<input class="form-control" type="file" name="file" id="file" accept="application/pdf">
</div>
<div class="mb-3">
<label for="text_input" class="form-label">Job Description</label>
<textarea class="form-control" name="text_input" id="text_input" rows="3" placeholder="Paste the job description here..." maxlength="2000"></textarea>
<!-- Google reCAPTCHA Widget -->
<div class="mt-3">
<div class="g-recaptcha" data-sitekey="{{ recaptcha_site_key }}"></div>
</div>
</div>
<!-- Error text from the server or the AJAX error handler is written here by main.js. -->
<div class="mb-3 text-danger" id="error-message"></div>
<button type="submit" class="btn btn-primary w-100">Process</button>
</form>
<!-- Hidden by style.css; main.js shows it while a request is running. -->
<div class="loading-spinner">
<div class="spinner-border text-primary" role="status"></div>
<p>Processing... Please wait.</p>
</div>
</div>
</div>
<script src="{{ url_for('static', filename='js/main.js') }}"></script>
</body>
</html>