# instagram-downloader/run.py

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from flask import Flask, redirect
import re
from collections import OrderedDict
import time

# Define the maximum cache size and duration in seconds (4 hours)
MAX_CACHE_SIZE = 50
CACHE_DURATION = 4 * 60 * 60  # 4 hours in seconds
# OrderedDict takes no maxlen argument; the size cap is enforced manually
# when new entries are added in get_video_source_server()
cache = OrderedDict()

# Validate the query string; modify this regex as needed
VALID_QUERY_REGEX = re.compile(r'^[\w\-\.\/]+$')
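# For illustration (hypothetical shortcode): paths such as "reel/Cabc123xyz" or
# "p/Cabc123xyz" pass this check, while anything containing spaces, "?" or other
# characters outside [\w\-\./] is rejected by the route handler with HTTP 400.
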
app = Flask(__name__)

# Remove cache entries older than CACHE_DURATION
def invalidate_old_entries():
    current_time = time.time()
    # Iterate over a copy of the keys to avoid modifying the dict while iterating
    for key in list(cache.keys()):
        entry = cache[key]
        if isinstance(entry, dict) and "timestamp" in entry:
            if current_time - entry["timestamp"] >= CACHE_DURATION:
                del cache[key]

# Define the base URL for scraping
base_url = "https://instagram.com"

# Initialize the WebDriver globally
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--window-size=1920,1080')
options.add_argument('--no-sandbox')
options.add_argument(f'user-agent={user_agent}')

browser = webdriver.Chrome(options=options)
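# Note: a single global headless Chrome instance is reused for every request, so
# lookups are effectively sequential; there is no re-launch logic if the browser
# process dies.
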
# Function to handle web scraping using Selenium
def get_video_source(query_string):
    try:
        browser.delete_all_cookies()
        query_string = "/" + query_string
        url = f"{base_url}{query_string}"  # Combine base URL and post/reel path
        browser.get(url)
        # Implicit wait; replace with an explicit WebDriverWait if possible
        browser.implicitly_wait(10)
        browser.save_screenshot('ss_ig_reel.png')

        # Locate the media element using its full XPath
        try:
            # Reels (video)
            video_element = browser.find_element(
                By.XPATH,
                "/html/body/div[2]/div/div/div[2]/div/div/div[1]/section/main/div[1]/div[1]/article/div/div[1]/div/div/div/div/div/div/div/video"
            )
        except NoSuchElementException:
            # Post (image)
            video_element = browser.find_element(
                By.XPATH,
                "/html/body/div[2]/div/div/div[2]/div/div/div[1]/div[1]/div[2]/section/main/div/div[1]/div/div[1]/div/div/div/div/div/div/div[1]/img"
            )

        # Get the media source URL and return it
        video_source = video_element.get_attribute("src")
        return video_source
    except Exception as e:
        # Handle exceptions and return a default URL
        print("Error:", e)
        browser.get("https://api.dev.gabrielkheisa.xyz/")
        return base_url

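# These absolute XPaths are tied to Instagram's current markup and may need
# updating whenever the page structure changes.
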
@app.route("/", methods=["GET"])  # Route for empty query string
def handle_empty_query():
    return redirect("https://github.com/gabrielkheisa/instagram-downloader")

@app.route("/<path:query_string>", methods=["GET"])
def get_video_source_server(query_string):
2023-12-19 15:38:08 +07:00
global cache # Ensure we reference the global cache variable
2023-12-19 16:34:37 +07:00
print(query_string)
2024-04-14 13:17:07 +07:00
if len(query_string) > 80:
2023-12-19 15:38:08 +07:00
return '', 204
2023-12-19 16:34:37 +07:00
if not VALID_QUERY_REGEX.match(query_string):
return "Invalid link", 400
2023-12-19 15:38:08 +07:00
# Clean up entries older than 4 hours
current_time = time.time()
keys_to_remove = []
for key in list(cache.keys()):
value = cache[key]
if isinstance(value, dict) and "timestamp" in value:
timestamp = value["timestamp"]
if current_time - timestamp >= CACHE_DURATION:
keys_to_remove.append(key)
for key in keys_to_remove:
cache.pop(key, None)
2023-12-10 16:00:17 +07:00
if query_string in cache:
2023-12-19 15:38:08 +07:00
# Move the existing entry to the front of the cache and update its timestamp
2023-12-10 16:00:17 +07:00
video_source = cache.pop(query_string)
2023-12-19 15:38:08 +07:00
video_source["timestamp"] = time.time()
2023-12-10 16:00:17 +07:00
cache[query_string] = video_source
2023-12-19 15:38:08 +07:00
return redirect(video_source["url"])
2024-04-14 13:17:07 +07:00
# Get the video source sequentially
video_source = get_video_source(query_string)
# Add the new entry to the cache with a timestamp
cache[query_string] = {"url": video_source, "timestamp": time.time()}
return redirect(video_source)
if __name__ == "__main__":
    app.run(debug=False, port=8080, host="0.0.0.0")
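
# Example usage (a sketch, assuming the service is running locally on port 8080;
# the shortcode below is hypothetical):
#   curl -I "http://localhost:8080/reel/Cabc123xyz"
#   -> 302 redirect to the resolved media URL (or to base_url on failure)
#   curl -I "http://localhost:8080/"
#   -> 302 redirect to https://github.com/gabrielkheisa/instagram-downloader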