Mirror of https://github.com/gabrielkheisa/instagram-downloader.git, synced 2025-09-13 06:41:09 +00:00
Compare commits
9 Commits
b37c957637...main

| SHA1 |
|---|
| c83b842968 |
| c293cf5e67 |
| 2cb5041524 |
| e3d935f6da |
| 107c837fe8 |
| 78b6f3dc7e |
| 6a581c917f |
| 9deb70acc1 |
| 595c1e54de |
.github/workflows/docker-image.yml (vendored, normal file, 23 lines added)

@@ -0,0 +1,23 @@
name: Build and Publish Docker Image

on:
  push:
    branches:
      - main # Trigger the workflow on pushes to the main branch

jobs:
  publish:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Login to GitHub Packages Docker registry
        run: echo "${{ secrets.GH_TOKEN }}" | docker login docker.pkg.github.com -u ${{ github.repository_owner }} --password-stdin

      - name: Build Docker image
        run: docker build -t docker.pkg.github.com/${{ github.repository }}/${{ github.repository }}:latest .

      - name: Push Docker image
        run: docker push docker.pkg.github.com/${{ github.repository }}/${{ github.repository }}:latest
Dockerfile (normal file, 28 lines added)

@@ -0,0 +1,28 @@
# Use the official Selenium standalone Chrome image as base
FROM selenium/standalone-chrome:112.0.5615.165-chromedriver-112.0.5615.49

ENV DEBIAN_FRONTEND=noninteractive

# Update package lists
RUN sudo apt-get update

# Install Python 3.6 and pip
RUN sudo apt-get install -y python3

RUN sudo apt-get install -y python3-pip

# Install Selenium and any other Python dependencies you may need
RUN sudo pip install selenium==3.141.0
RUN sudo pip install --upgrade urllib3==1.26.16
RUN sudo pip install Flask

# Set the working directory
WORKDIR /usr/src/app

# Copy your Python scripts into the container
COPY . .

EXPOSE 8080

# Example command to run your Python script
CMD ["python3", "run.py"]
README.md (33 changed lines)

@@ -17,8 +17,9 @@ https://ig.gabrielkheisa.xyz/reel/Cz3dNmDMVC9/?igshid=MzRlODBiNWFlZA==
```
### Returns redirect:
```
https://scontent.cdninstagram.com/v/t66.30100-16/316926421_1723935788092224_3596729375098306652_n.mp4?_nc_ht=scontent.cdninstagram.com&_nc_cat=100&_nc_ohc=6lyBPVcjJkYAX8kLe3I&edm=APs17CUBAAAA&ccb=7-5&oh=00_AfBNGf7HzFPnd-mhfvhZZZRk_-PlN3qx3hqbsINaUGA4aA&oe=6576D61D&_nc_sid=10d13b
https://scontent.cdninstagram.com/v/t66.30100-16/316926421_1723935788092224_3596729375098306652_n.mp4?_nc_ht=scontent.cdninstagram.com&_nc_cat=100&_nc_ohc=6lyBPVcj...............
```
Notes: This redirect URL (https://scontent.cdninstagram.com/...) has expiration, in which you need to re-run the query to get a new URL signature

## DISCLAIMER:

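Because the redirect target above is a signed scontent.cdninstagram.com URL that expires, a caller typically re-runs the query whenever the old link stops working. A minimal client-side sketch, assuming the `requests` package is installed; the host and shortcode below are placeholders taken from the README example, not an official client:

```python
import requests

BASE = "https://ig.gabrielkheisa.xyz"   # placeholder host from the README example
SHORTCODE = "Cz3dNmDMVC9"               # placeholder reel shortcode

def fresh_video_url():
    # Ask the micro web server to resolve the reel, but do not follow the redirect,
    # so the signed CDN URL can be inspected and reused.
    resp = requests.get(f"{BASE}/reel/{SHORTCODE}", allow_redirects=False, timeout=15)
    if resp.status_code not in (301, 302, 303, 307, 308):
        raise RuntimeError(f"unexpected response: {resp.status_code}")
    return resp.headers["Location"]

url = fresh_video_url()
video = requests.get(url, timeout=30)
if video.status_code in (403, 410):
    # The URL signature has likely expired; re-run the query to get a fresh one.
    video = requests.get(fresh_video_url(), timeout=30)
```
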
@@ -65,10 +66,38 @@ This micro web server does not directly download the Instagram Reels video. It s
git clone https://github.com/gabrielkheisa/instagram-downloader.git
```

### With Dockerfile

1. Build the Dockerfile

```
sudo docker build -t instagram-downloader .
```

2. Run the container

```
sudo docker run -d -p 8080:8080 instagram-downloader
```

### With existing Docker Image

1. Pull the Docker image

```
sudo docker pull ghcr.io/gabrielkheisa/instagram-downloader/gabrielkheisa/instagram-downloader:latest
```

2. Run the container

```
sudo docker run -d -p 8080:8080 ghcr.io/gabrielkheisa/instagram-downloader/gabrielkheisa/instagram-downloader
```


## Usage

1. Start the Flask app:
1. Start the Flask app, skip this part if you use Docker:

```
python run.py
```
run.py (103 changed lines)

@@ -1,10 +1,29 @@
from selenium import webdriver
from flask import Flask, request, redirect
import concurrent.futures
from flask import Flask, redirect
import re
from collections import OrderedDict
import time

# Define the maximum cache size and duration in seconds (4 hours)
MAX_CACHE_SIZE = 50
CACHE_DURATION = 4 * 60 * 60 # 4 hours in seconds
cache = OrderedDict(maxlen=MAX_CACHE_SIZE)

# Validate query, modify this regex as needed
VALID_QUERY_REGEX = re.compile(r'^[\w\-\.\/]+$')

app = Flask(__name__)

def invalidate_old_entries():
    current_time = time.time()
    one_hour_ago = current_time - 3600 # 1 hour in seconds

    # Iterate over a copy of the keys to avoid modifying while iterating
    for key in list(cache.keys()):
        timestamp, _ = cache[key]
        if timestamp < one_hour_ago:
            del cache[key]

# Define the base URL for scraping
base_url = "https://instagram.com" # Replace with your actual base URL
@@ -16,26 +35,32 @@ options.add_argument('--disable-gpu')
options.add_argument('--window-size=1920,1080')
options.add_argument('--no-sandbox')
options.add_argument(f'user-agent={user_agent}')
browser = webdriver.Chrome(executable_path="/usr/bin/chromedriver", options=options)

cache = OrderedDict(maxlen=50)
browser = webdriver.Chrome(options=options)

# Function to handle web scraping using Selenium
def get_video_source(query_string):
    try:
        browser.delete_all_cookies()

        query_string = "/" + query_string
        url = f"{base_url}{query_string}" # Combine base URL and video ID
        browser.get(url)

        # Replace sleep with explicit wait if possible
        browser.implicitly_wait(4)
        browser.implicitly_wait(10)

        browser.save_screenshot('ss_ig_reel.png')

        # Locate the video element using your specific xpath
        video_element = browser.find_element_by_xpath(
            "/html/body/div[2]/div/div/div[2]/div/div/div[1]/section/main/div[1]/div/article/div/div[1]/div/div/div/div/div/div/div/video"
        )
        try:
            # Reels
            video_element = browser.find_element_by_xpath(
                "/html/body/div[2]/div/div/div[2]/div/div/div[1]/section/main/div[1]/div[1]/article/div/div[1]/div/div/div/div/div/div/div/video"
            )
        except:
            # Post (image)
            video_element = browser.find_element_by_xpath(
                "/html/body/div[2]/div/div/div[2]/div/div/div[1]/div[1]/div[2]/section/main/div/div[1]/div/div[1]/div/div/div/div/div/div/div[1]/img"
            )

        # Get the video source and return it
        video_source = video_element.get_attribute("src")
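The "Replace sleep with explicit wait if possible" comment in the hunk above refers to Selenium's explicit waits. A minimal sketch of that alternative against the pinned selenium==3.141.0 API, waiting for a `<video>` element instead of calling implicitly_wait; the simple tag-name locator here is an assumption for illustration, not the XPath used in the commit:

```python
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Block for up to 10 seconds until a <video> tag appears in the DOM,
# then use the element directly instead of sleeping a fixed amount of time.
video_element = WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.TAG_NAME, "video"))
)
video_source = video_element.get_attribute("src")
```
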
@@ -43,28 +68,54 @@ def get_video_source(query_string):

    except Exception as e:
        # Handle exceptions and return a default URL or re-raise the exception
        print("Error: ")
        print(e)
        browser.get("https://api.dev.gabrielkheisa.xyz/")
        return base_url


@app.route("/", methods=["GET"]) # Route for empty query string
def handle_empty_query():
    return redirect("https://github.com/gabrielkheisa/instagram-downloader")

@app.route("/<path:query_string>", methods=["GET"])
def get_video_source_server(query_string):
    if len(query_string) > 30:
        # Reject the request by returning a 414 error code
        return abort(414, description="Query string too long")
    global cache # Ensure we reference the global cache variable
    print(query_string)
    if len(query_string) > 80:
        return '', 204

    if not VALID_QUERY_REGEX.match(query_string):
        return "Invalid link", 400


    # Clean up entries older than 4 hours
    current_time = time.time()
    keys_to_remove = []
    for key in list(cache.keys()):
        value = cache[key]
        if isinstance(value, dict) and "timestamp" in value:
            timestamp = value["timestamp"]
            if current_time - timestamp >= CACHE_DURATION:
                keys_to_remove.append(key)

    for key in keys_to_remove:
        cache.pop(key, None)

    if query_string in cache:
        # If cached, move to the front of the OrderedDict to update its age
        # Move the existing entry to the front of the cache and update its timestamp
        video_source = cache.pop(query_string)
        video_source["timestamp"] = time.time()
        cache[query_string] = video_source
        return redirect(video_source)
    # Create a ThreadPoolExecutor for parallel execution with a timeout of 3 seconds
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future = executor.submit(get_video_source, query_string)
        try:
            video_source = future.result(timeout=10) # Timeout set to 3 seconds
            cache[query_string] = video_source
            return redirect(video_source)
        except concurrent.futures.TimeoutError:
            # Handle timeout - return a default URL or handle as needed
            return redirect(base_url)
        return redirect(video_source["url"])

    # Get the video source sequentially
    video_source = get_video_source(query_string)

    # Add the new entry to the cache with a timestamp
    cache[query_string] = {"url": video_source, "timestamp": time.time()}

    return redirect(video_source)

if __name__ == "__main__":
    app.run(debug=False, port=8080, host="0.0.0.0")
    app.run(debug=False, port=8080, host="0.0.0.0")
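The handler above keeps a small TTL cache of resolved URLs: each entry stores {"url", "timestamp"}, entries older than CACHE_DURATION are purged, and cache hits are re-inserted so recent queries stay at the end of the OrderedDict. Note that collections.OrderedDict takes no maxlen argument, so OrderedDict(maxlen=MAX_CACHE_SIZE) does not by itself cap the cache size. A minimal sketch of the same bounded TTL/LRU idea with explicit eviction; the helper names are illustrative, not from the repository:

```python
from collections import OrderedDict
import time

MAX_CACHE_SIZE = 50
CACHE_DURATION = 4 * 60 * 60  # 4 hours in seconds
cache = OrderedDict()  # query_string -> {"url": ..., "timestamp": ...}

def cache_put(key, url):
    # Insert or refresh an entry, then evict the oldest entries over the size cap.
    cache[key] = {"url": url, "timestamp": time.time()}
    cache.move_to_end(key)
    while len(cache) > MAX_CACHE_SIZE:
        cache.popitem(last=False)  # drop the least recently used entry

def cache_get(key):
    # Return a cached URL if present and younger than CACHE_DURATION, else None.
    entry = cache.get(key)
    if entry is None:
        return None
    if time.time() - entry["timestamp"] >= CACHE_DURATION:
        del cache[key]
        return None
    cache.move_to_end(key)  # a hit counts as recent use
    return entry["url"]
```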