Compare commits

..

2 Commits

Author SHA1 Message Date
gabrielkheisa
43e540d7b0 Deployment using Docker 2024-04-14 14:39:20 +07:00
gabrielkheisa
1410f1c3d8 remove parallelism 2024-04-14 13:17:07 +07:00
3 changed files with 90 additions and 30 deletions

28
Dockerfile Normal file
View File

@@ -0,0 +1,28 @@
# Use the official Selenium standalone Chrome image as base
FROM selenium/standalone-chrome:112.0.5615.165-chromedriver-112.0.5615.49
ENV DEBIAN_FRONTEND=noninteractive
# Update package lists
RUN sudo apt-get update
# Install Python 3.6 and pip
RUN sudo apt-get install -y python3
RUN sudo apt-get install -y python3-pip
# Install Selenium and any other Python dependencies you may need
RUN sudo pip install selenium==3.141.0
RUN sudo pip install --upgrade urllib3==1.26.16
RUN sudo pip install Flask
# Set the working directory
WORKDIR /usr/src/app
# Copy your Python scripts into the container
COPY . .
EXPOSE 8080
# Example command to run your Python script
CMD ["python3", "run.py"]

View File

@@ -66,10 +66,24 @@ This micro web server does not directly download the Instagram Reels video. It s
git clone https://github.com/gabrielkheisa/instagram-downloader.git git clone https://github.com/gabrielkheisa/instagram-downloader.git
``` ```
### With Dockerfile
1. Build the Dockerfile
```
sudo docker build -t instagram-downloader .
```
2. Run the container
```
sudo docker run -d -p 8080:8080 instagram-downloader
```
## Usage ## Usage
1. Start the Flask app: 1. Start the Flask app, skip this part if you use Docker:
``` ```
python run.py python run.py

60
run.py
View File

@@ -1,12 +1,29 @@
from selenium import webdriver from selenium import webdriver
from flask import Flask, request, redirect from flask import Flask, redirect
import concurrent.futures
import re import re
from collections import OrderedDict from collections import OrderedDict
import time import time
# Define the maximum cache size and duration in seconds (4 hours)
MAX_CACHE_SIZE = 50
CACHE_DURATION = 4 * 60 * 60 # 4 hours in seconds
cache = OrderedDict(maxlen=MAX_CACHE_SIZE)
# Validate query, modify this regex as needed
VALID_QUERY_REGEX = re.compile(r'^[\w\-\.\/]+$')
app = Flask(__name__) app = Flask(__name__)
def invalidate_old_entries():
current_time = time.time()
one_hour_ago = current_time - 3600 # 1 hour in seconds
# Iterate over a copy of the keys to avoid modifying while iterating
for key in list(cache.keys()):
timestamp, _ = cache[key]
if timestamp < one_hour_ago:
del cache[key]
# Define the base URL for scraping # Define the base URL for scraping
base_url = "https://instagram.com" # Replace with your actual base URL base_url = "https://instagram.com" # Replace with your actual base URL
@@ -18,32 +35,32 @@ options.add_argument('--disable-gpu')
options.add_argument('--window-size=1920,1080') options.add_argument('--window-size=1920,1080')
options.add_argument('--no-sandbox') options.add_argument('--no-sandbox')
options.add_argument(f'user-agent={user_agent}') options.add_argument(f'user-agent={user_agent}')
browser = webdriver.Chrome(executable_path="/usr/bin/chromedriver", options=options) browser = webdriver.Chrome(options=options)
# Define the maximum cache size and duration in seconds (4 hours)
MAX_CACHE_SIZE = 50
CACHE_DURATION = 4 * 60 * 60 # 4 hours in seconds
cache = OrderedDict(maxlen=MAX_CACHE_SIZE)
# Validate query, modify this regex as needed
VALID_QUERY_REGEX = re.compile(r'^[\w\-\.\/]+$')
# Function to handle web scraping using Selenium # Function to handle web scraping using Selenium
def get_video_source(query_string): def get_video_source(query_string):
try: try:
browser.delete_all_cookies() browser.delete_all_cookies()
query_string = "/" + query_string query_string = "/" + query_string
url = f"{base_url}{query_string}" # Combine base URL and video ID url = f"{base_url}{query_string}" # Combine base URL and video ID
browser.get(url) browser.get(url)
# Replace sleep with explicit wait if possible # Replace sleep with explicit wait if possible
browser.implicitly_wait(4) browser.implicitly_wait(10)
browser.save_screenshot('ss_ig_reel.png')
# Locate the video element using your specific xpath # Locate the video element using your specific xpath
try:
# Reels
video_element = browser.find_element_by_xpath( video_element = browser.find_element_by_xpath(
"/html/body/div[2]/div/div/div[2]/div/div/div[1]/section/main/div[1]/div[1]/article/div/div[1]/div/div/div/div/div/div/div/video" "/html/body/div[2]/div/div/div[2]/div/div/div[1]/section/main/div[1]/div[1]/article/div/div[1]/div/div/div/div/div/div/div/video"
) )
except:
# Post (image)
video_element = browser.find_element_by_xpath(
"/html/body/div[2]/div/div/div[2]/div/div/div[1]/div[1]/div[2]/section/main/div/div[1]/div/div[1]/div/div/div/div/div/div/div[1]/img"
)
# Get the video source and return it # Get the video source and return it
video_source = video_element.get_attribute("src") video_source = video_element.get_attribute("src")
@@ -51,8 +68,12 @@ def get_video_source(query_string):
except Exception as e: except Exception as e:
# Handle exceptions and return a default URL or re-raise the exception # Handle exceptions and return a default URL or re-raise the exception
print("Error: ")
print(e)
browser.get("https://api.dev.gabrielkheisa.xyz/")
return base_url return base_url
@app.route("/", methods=["GET"]) # Route for empty query string @app.route("/", methods=["GET"]) # Route for empty query string
def handle_empty_query(): def handle_empty_query():
return redirect("https://github.com/gabrielkheisa/instagram-downloader") return redirect("https://github.com/gabrielkheisa/instagram-downloader")
@@ -61,7 +82,7 @@ def handle_empty_query():
def get_video_source_server(query_string): def get_video_source_server(query_string):
global cache # Ensure we reference the global cache variable global cache # Ensure we reference the global cache variable
print(query_string) print(query_string)
if len(query_string) > 30: if len(query_string) > 80:
return '', 204 return '', 204
if not VALID_QUERY_REGEX.match(query_string): if not VALID_QUERY_REGEX.match(query_string):
@@ -88,16 +109,13 @@ def get_video_source_server(query_string):
cache[query_string] = video_source cache[query_string] = video_source
return redirect(video_source["url"]) return redirect(video_source["url"])
# Create a ThreadPoolExecutor for parallel execution with a timeout of 15 seconds # Get the video source sequentially
with concurrent.futures.ThreadPoolExecutor() as executor: video_source = get_video_source(query_string)
future = executor.submit(get_video_source, query_string)
try:
video_source = future.result(timeout=15) # Timeout set to 15 seconds
# Add the new entry to the cache with a timestamp # Add the new entry to the cache with a timestamp
cache[query_string] = {"url": video_source, "timestamp": time.time()} cache[query_string] = {"url": video_source, "timestamp": time.time()}
return redirect(video_source) return redirect(video_source)
except concurrent.futures.TimeoutError:
return redirect(base_url) # Handle timeout - return a default URL or handle as needed
if __name__ == "__main__": if __name__ == "__main__":
app.run(debug=False, port=8080, host="0.0.0.0") app.run(debug=False, port=8080, host="0.0.0.0")