Compare commits

...

3 Commits

Author          SHA1        Message                    Date
gabrielkheisa   2cb5041524  update                     2024-04-14 15:17:13 +07:00
Gabriel Kheisa  e3d935f6da  Create docker-image.yml    2024-04-14 14:59:48 +07:00
Gabriel Kheisa  107c837fe8  Docker deployment (#5)     2024-04-14 14:42:28 +07:00
                            * remove parallelism
                            * Deployment using Docker
4 changed files with 127 additions and 30 deletions

.github/workflows/docker-image.yml (new file, 23 lines)

@@ -0,0 +1,23 @@
name: Build and Publish Docker Image

on:
  push:
    branches:
      - main # Trigger the workflow on pushes to the main branch

jobs:
  publish:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Login to GitHub Packages Docker registry
        run: echo "${{ secrets.GH_TOKEN }}" | docker login docker.pkg.github.com -u ${{ github.repository_owner }} --password-stdin

      - name: Build Docker image
        run: docker build -t docker.pkg.github.com/${{ github.repository }}/${{ github.repository }}:latest .

      - name: Push Docker image
        run: docker push docker.pkg.github.com/${{ github.repository }}/${{ github.repository }}:latest
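Worth noting: ${{ github.repository }} expands to the full "owner/name" slug, so using it twice in the tag produces a doubled registry path. A quick illustrative sketch of the expansion (Python; the slug is this repository's, the layout is standard registry/namespace/name:tag):

```
# How the workflow's image tag expands: ${{ github.repository }} resolves to
# the "owner/name" slug, so repeating it doubles the path.
repository = "gabrielkheisa/instagram-downloader"  # value of ${{ github.repository }}

tag = f"docker.pkg.github.com/{repository}/{repository}:latest"
print(tag)
# docker.pkg.github.com/gabrielkheisa/instagram-downloader/gabrielkheisa/instagram-downloader:latest
```

This matches the doubled gabrielkheisa/instagram-downloader/gabrielkheisa/instagram-downloader path the README below pulls via ghcr.io.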

Dockerfile (new file, 28 lines)

@@ -0,0 +1,28 @@
# Use the official Selenium standalone Chrome image as base
FROM selenium/standalone-chrome:112.0.5615.165-chromedriver-112.0.5615.49

ENV DEBIAN_FRONTEND=noninteractive

# Update package lists
RUN sudo apt-get update

# Install Python 3 and pip
RUN sudo apt-get install -y python3
RUN sudo apt-get install -y python3-pip

# Install Selenium and any other Python dependencies you may need
RUN sudo pip install selenium==3.141.0
RUN sudo pip install --upgrade urllib3==1.26.16
RUN sudo pip install Flask

# Set the working directory
WORKDIR /usr/src/app

# Copy your Python scripts into the container
COPY . .

EXPOSE 8080

# Example command to run your Python script
CMD ["python3", "run.py"]
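One compatibility note: the pinned selenium==3.141.0 matches the find_element_by_xpath calls in run.py below; Selenium 4 removed that helper family. A minimal sketch of the Selenium 4 spelling, only relevant if the pin is ever lifted (the URL and xpath are placeholders):

```
# Selenium 4 replacement for the find_element_by_* helpers used in run.py;
# not needed while the Dockerfile pins selenium==3.141.0.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

options = Options()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)

browser.get("https://instagram.com/reel/placeholder")  # placeholder URL
video = browser.find_element(By.XPATH, "//video")  # Selenium 3: find_element_by_xpath("//video")
print(video.get_attribute("src"))
browser.quit()
```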

README.md

@@ -66,10 +66,38 @@ This micro web server does not directly download the Instagram Reels video. It s
 git clone https://github.com/gabrielkheisa/instagram-downloader.git
 ```
+### With Dockerfile
+1. Build the Dockerfile
+```
+sudo docker build -t instagram-downloader .
+```
+2. Run the container
+```
+sudo docker run -d -p 8080:8080 instagram-downloader
+```
+### With existing Docker Image
+1. Pull the Docker image
+```
+sudo docker pull ghcr.io/gabrielkheisa/instagram-downloader/gabrielkheisa/instagram-downloader:latest
+```
+2. Run the container
+```
+sudo docker run -d -p 8080:8080 ghcr.io/gabrielkheisa/instagram-downloader/gabrielkheisa/instagram-downloader
+```
 ## Usage
-1. Start the Flask app:
+1. Start the Flask app, skip this part if you use Docker:
 ```
 python run.py

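Since a successful request answers with an HTTP redirect to the scraped media URL (see run.py below), here is a minimal client sketch, assuming the container above is listening on localhost:8080; the shortcode in the URL is a made-up placeholder:

```
# Query the service and inspect the redirect target instead of following it.
# Host, port, and the "reel/..." shortcode are assumptions for illustration.
import requests

resp = requests.get(
    "http://localhost:8080/reel/EXAMPLE123",  # hypothetical shortcode
    allow_redirects=False,                    # keep the 302 so we can read Location
)
print(resp.status_code)              # 302 on success; 204 if the query is rejected
print(resp.headers.get("Location"))  # the resolved video/image source URL
```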
run.py (76 changes)

@@ -1,12 +1,29 @@
 from selenium import webdriver
-from flask import Flask, request, redirect
-import concurrent.futures
+from flask import Flask, redirect
 import re
 from collections import OrderedDict
 import time
+# Define the maximum cache size and duration in seconds (4 hours)
+MAX_CACHE_SIZE = 50
+CACHE_DURATION = 4 * 60 * 60  # 4 hours in seconds
+cache = OrderedDict(maxlen=MAX_CACHE_SIZE)
+# Validate query, modify this regex as needed
+VALID_QUERY_REGEX = re.compile(r'^[\w\-\.\/]+$')
 app = Flask(__name__)
+def invalidate_old_entries():
+    current_time = time.time()
+    one_hour_ago = current_time - 3600  # 1 hour in seconds
+    # Iterate over a copy of the keys to avoid modifying while iterating
+    for key in list(cache.keys()):
+        timestamp, _ = cache[key]
+        if timestamp < one_hour_ago:
+            del cache[key]
 # Define the base URL for scraping
 base_url = "https://instagram.com"  # Replace with your actual base URL
@@ -18,32 +35,32 @@ options.add_argument('--disable-gpu')
 options.add_argument('--window-size=1920,1080')
 options.add_argument('--no-sandbox')
 options.add_argument(f'user-agent={user_agent}')
-browser = webdriver.Chrome(executable_path="/usr/bin/chromedriver", options=options)
-# Define the maximum cache size and duration in seconds (4 hours)
-MAX_CACHE_SIZE = 50
-CACHE_DURATION = 4 * 60 * 60  # 4 hours in seconds
-cache = OrderedDict(maxlen=MAX_CACHE_SIZE)
-# Validate query, modify this regex as needed
-VALID_QUERY_REGEX = re.compile(r'^[\w\-\.\/]+$')
+browser = webdriver.Chrome(options=options)
 # Function to handle web scraping using Selenium
 def get_video_source(query_string):
     try:
         browser.delete_all_cookies()
         query_string = "/" + query_string
         url = f"{base_url}{query_string}"  # Combine base URL and video ID
         browser.get(url)
         # Replace sleep with explicit wait if possible
-        browser.implicitly_wait(4)
+        browser.implicitly_wait(10)
+        browser.save_screenshot('ss_ig_reel.png')
-        # Locate the video element using your specific xpath
-        video_element = browser.find_element_by_xpath(
-            "/html/body/div[2]/div/div/div[2]/div/div/div[1]/section/main/div[1]/div[1]/article/div/div[1]/div/div/div/div/div/div/div/video"
-        )
+        try:
+            # Reels
+            video_element = browser.find_element_by_xpath(
+                "/html/body/div[2]/div/div/div[2]/div/div/div[1]/section/main/div[1]/div[1]/article/div/div[1]/div/div/div/div/div/div/div/video"
+            )
+        except:
+            # Post (image)
+            video_element = browser.find_element_by_xpath(
+                "/html/body/div[2]/div/div/div[2]/div/div/div[1]/div[1]/div[2]/section/main/div/div[1]/div/div[1]/div/div/div/div/div/div/div[1]/img"
+            )
         # Get the video source and return it
         video_source = video_element.get_attribute("src")
@@ -51,8 +68,12 @@ def get_video_source(query_string):
     except Exception as e:
         # Handle exceptions and return a default URL or re-raise the exception
+        print("Error: ")
+        print(e)
+        browser.get("https://api.dev.gabrielkheisa.xyz/")
         return base_url

 @app.route("/", methods=["GET"])  # Route for empty query string
 def handle_empty_query():
     return redirect("https://github.com/gabrielkheisa/instagram-downloader")
@@ -61,7 +82,7 @@ def handle_empty_query():
 def get_video_source_server(query_string):
     global cache  # Ensure we reference the global cache variable
     print(query_string)
-    if len(query_string) > 30:
+    if len(query_string) > 80:
         return '', 204
     if not VALID_QUERY_REGEX.match(query_string):
@@ -88,16 +109,13 @@ def get_video_source_server(query_string):
         cache[query_string] = video_source
         return redirect(video_source["url"])
-    # Create a ThreadPoolExecutor for parallel execution with a timeout of 15 seconds
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        future = executor.submit(get_video_source, query_string)
-        try:
-            video_source = future.result(timeout=15)  # Timeout set to 15 seconds
-            # Add the new entry to the cache with a timestamp
-            cache[query_string] = {"url": video_source, "timestamp": time.time()}
-            return redirect(video_source)
-        except concurrent.futures.TimeoutError:
-            return redirect(base_url)  # Handle timeout - return a default URL or handle as needed
+    # Get the video source sequentially
+    video_source = get_video_source(query_string)
+    # Add the new entry to the cache with a timestamp
+    cache[query_string] = {"url": video_source, "timestamp": time.time()}
+    return redirect(video_source)
 if __name__ == "__main__":
-    app.run(debug=False, port=8080, host="0.0.0.0")
+    app.run(debug=False, port=8080, host="0.0.0.0")
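One caveat for readers of this diff: OrderedDict accepts no maxlen keyword (that belongs to collections.deque), so OrderedDict(maxlen=MAX_CACHE_SIZE) just creates a one-entry mapping {'maxlen': 50} rather than a bounded cache, and `timestamp, _ = cache[key]` unpacks a value that is stored elsewhere as {"url": ..., "timestamp": ...}. A minimal sketch of the size-capped, time-expiring cache the module appears to intend (names reused from the diff; the eviction logic is an assumption, not the repository's code):

```
# Bounded, time-expiring cache matching the {"url": ..., "timestamp": ...}
# entries run.py stores. Illustrative sketch only.
import time
from collections import OrderedDict

MAX_CACHE_SIZE = 50
CACHE_DURATION = 4 * 60 * 60  # 4 hours in seconds

cache = OrderedDict()  # no maxlen kwarg exists; enforce the cap manually

def put(query_string, url):
    if len(cache) >= MAX_CACHE_SIZE:
        cache.popitem(last=False)  # evict the oldest entry (FIFO)
    cache[query_string] = {"url": url, "timestamp": time.time()}

def invalidate_old_entries():
    cutoff = time.time() - CACHE_DURATION
    # Iterate over a copy of the keys to avoid modifying while iterating
    for key in list(cache.keys()):
        if cache[key]["timestamp"] < cutoff:
            del cache[key]
```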