From 5ddb8ffabe824f70424518ec1c48ed0ef51ba6eb Mon Sep 17 00:00:00 2001 From: gabrielkheisa Date: Sun, 10 Dec 2023 13:03:57 +0700 Subject: [PATCH] add --- README.md | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ run.py | 58 ++++++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+) create mode 100644 README.md create mode 100644 run.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..99bc235 --- /dev/null +++ b/README.md @@ -0,0 +1,98 @@ +# Instagram Video Downloader + +This micro web server allows you to download Instagram Reels videos by providing their query parameters. It utilizes Selenium to control a Chromium browser instance and navigate to the Instagram webpage. Once the video is loaded, it extracts the CDN link and redirects you to it. + +### Usage: +``` +https://ig.gabrielkheisa.xyz/ +``` +### Example: +**Source** +``` +https://www.instagram.com/reel/Cz3dNmDMVC9/?igshid=MzRlODBiNWFlZA== +``` +**Replace** +``` +https://ig.gabrielkheisa.xyz/reel/Cz3dNmDMVC9/?igshid=MzRlODBiNWFlZA== +``` +### Returns redirect: +``` +https://scontent.cdninstagram.com/v/t66.30100-16/316926421_1723935788092224_3596729375098306652_n.mp4?_nc_ht=scontent.cdninstagram.com&_nc_cat=100&_nc_ohc=6lyBPVcjJkYAX8kLe3I&edm=APs17CUBAAAA&ccb=7-5&oh=00_AfBNGf7HzFPnd-mhfvhZZZRk_-PlN3qx3hqbsINaUGA4aA&oe=6576D61D&_nc_sid=10d13b +``` + +## DISCLAIMER: + +This micro web server does not directly download the Instagram Reels video. It simply locates the Instagram CDN link for the video and redirects you to it. Therefore, it is not technically a "downloader" but rather a **CDN link extractor and redirector**. + +## Tech stack + +* **Python 3.6:** Programming language +* **Selenium:** Web automation framework +* **Chromium browser:** Web browser, run in headless mode +* **Flask:** Micro web server + +## Requirements + +* Python 3.6+ +* Selenium +* Chromium browser +* Flask + +## Installation + +1. Install Python 3.6 or newer. +2. 
Install Selenium: + + ``` + pip install selenium + ``` + +3. Install Chromium browser: + + ``` + sudo apt install chromium-browser + ``` + +4. Install Flask: + + ``` + pip install Flask + ``` + +5. Clone this repository: + + ``` + git clone https://github.com/gabrielkheisa/instagram-downloader.git + ``` + + +## Usage + +1. Start the Flask app: + + ``` + python run.py + ``` + +2. Open a web browser and navigate to http://localhost:8080/. +3. Append the path of your Instagram Reels URL (everything after `instagram.com`) to the local address. For example, the original Instagram URL: +``` +https://www.instagram.com/reel/Cz3dNmDMVC9/?igshid=MzRlODBiNWFlZA== +``` +becomes: +``` +http://localhost:8080/reel/Cz3dNmDMVC9/?igshid=MzRlODBiNWFlZA== +``` +4. Once you are redirected to something like https://scontent.cdninstagram.com/v/t66.30..., simply download the video. +5. If the Instagram Reels video exists and no exception or error occurs, you will be redirected to the Instagram CDN endpoint link where you can download the video directly; otherwise you will be redirected to instagram.com. + +## Limitations + +* It takes 3 to 5 seconds for the element at the Xpath on the remote page (instagram.com) to load properly, so an **implicit wait** is configured in the webdriver, making each request take relatively longer. +* It's possible that Instagram will change the structure of their website in the future, in which case you will need to find the new Xpath location. 
"""Micro web server that redirects an Instagram Reels path to its CDN video URL.

A single headless Chromium instance, driven through Selenium, loads the
requested Instagram page and reads the ``src`` attribute of its ``<video>``
element.  The Flask view then redirects the client to that URL, or to
``base_url`` when the video cannot be located in time.
"""

import concurrent.futures
import logging

from flask import Flask, redirect, request
from selenium import webdriver
from selenium.webdriver.common.by import By

logger = logging.getLogger(__name__)

app = Flask(__name__)

# Define the base URL for scraping
base_url = "https://instagram.com"  # Replace with your actual base URL

# Location of the chromedriver binary used to start the browser.
CHROMEDRIVER_PATH = "/usr/bin/chromedriver"

# Absolute XPath of the <video> element on a Reels page.  This mirrors the
# page structure and has to be updated whenever Instagram changes its markup.
VIDEO_XPATH = (
    "/html/body/div[2]/div/div/div[2]/div/div/div[1]/section/main/div[1]"
    "/div/article/div/div[1]/div/div/div/div/div/div/div/video"
)

# How long element look-ups poll for the <video> element, in seconds.
ELEMENT_WAIT_SECONDS = 4

# How long a request waits for the scraper before falling back, in seconds.
REQUEST_TIMEOUT_SECONDS = 10

# Initialize WebDriver globally
user_agent = (
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/60.0.3112.50 Safari/537.36'
)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--window-size=1920,1080')
options.add_argument('--no-sandbox')
options.add_argument(f'user-agent={user_agent}')
# NOTE(review): `executable_path` is the Selenium 3 way of passing the driver
# location; recent Selenium 4 releases expect a `Service` object instead --
# confirm which Selenium version this deployment targets.
browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=options)

# The implicit wait is a session-wide setting, so it is configured once here
# instead of being re-applied on every request.
browser.implicitly_wait(ELEMENT_WAIT_SECONDS)

# All scraping goes through this single-worker pool.  One worker serialises
# access to the shared `browser`, so overlapping requests cannot navigate it
# at the same time.  Because the pool lives at module level, a request that
# times out is not forced to wait for the worker to finish (a per-request
# `with ThreadPoolExecutor()` block waits for its workers on exit, which
# would defeat the timeout).
_scrape_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)


# Function to handle web scraping using Selenium
def get_video_source(query_string):
    """Return the CDN URL of the video found at ``base_url/<query_string>``.

    Args:
        query_string: Path of the Instagram page relative to ``base_url``,
            without the leading slash (for example ``"reel/Cz3dNmDMVC9/"``).

    Returns:
        The value of the ``src`` attribute of the page's ``<video>`` element.
        ``base_url`` is returned instead when the page cannot be loaded, the
        element is not found, or the element has no ``src`` value.
    """
    try:
        # Start every look-up from a clean session.
        browser.delete_all_cookies()

        url = f"{base_url}/{query_string}"  # Combine base URL and page path
        browser.get(url)

        # Locate the video element; the implicit wait configured at start-up
        # lets this poll until the element shows up or the wait expires.
        video_element = browser.find_element(By.XPATH, VIDEO_XPATH)
        video_source = video_element.get_attribute("src")
    except Exception:
        # Best effort by design: any scraping failure falls back to the base
        # URL, but the cause is logged rather than silently discarded.
        logger.exception("Could not extract a video source for %r", query_string)
        return base_url

    # `get_attribute` yields None when the attribute is missing; never hand
    # that to `redirect`.
    return video_source or base_url


@app.route("/", defaults={"query_string": ""}, methods=["GET"])
@app.route("/<path:query_string>", methods=["GET"])
def get_video_source_server(query_string):
    """Redirect the client to the CDN URL of the requested Instagram video.

    Args:
        query_string: Everything after the first ``/`` of the request path,
            or an empty string for a request to ``/``.

    Returns:
        A redirect response pointing at the extracted video URL, or at
        ``base_url`` when nothing was requested or the scraper did not answer
        within ``REQUEST_TIMEOUT_SECONDS``.
    """
    if not query_string:
        # Nothing to look up; skip the browser round trip.
        return redirect(base_url)

    future = _scrape_executor.submit(get_video_source, query_string)
    try:
        video_source = future.result(timeout=REQUEST_TIMEOUT_SECONDS)
    except concurrent.futures.TimeoutError:
        # Drops the job if it is still queued; a job that is already running
        # cannot be interrupted and simply finishes in the background.
        future.cancel()
        return redirect(base_url)
    return redirect(video_source)


if __name__ == "__main__":
    app.run(debug=False, port=8080, host="0.0.0.0")