mirror of
https://github.com/gabrielkheisa/news-scrapper.git
synced 2024-11-21 19:11:56 +07:00
first
This commit is contained in:
commit
a907ee8730
149
README.md
Normal file
149
README.md
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
<h1>Automated News Scraper</h1>
|
||||||
|
<h3>Returns results in JSON format</h3>
|
||||||
|
<h2>Example:</h2>
|
||||||
|
<ul>
|
||||||
|
<li>https://api.gabrielkheisa.xyz/news/antaranews/</li>
|
||||||
|
<pre>
|
||||||
|
{
|
||||||
|
"server_update": "08-11-2022 12:50:44",
|
||||||
|
"news": [
|
||||||
|
{
|
||||||
|
"judul": "IKN development with Forest City concept to mitigate climate change",
|
||||||
|
"berita": "",
|
||||||
|
"URL": "https://en.antaranews.com/news/259041/ikn-development-with-forest-city-concept-to-mitigate-climate-change",
|
||||||
|
"gambar": null,
|
||||||
|
"tgl": " 19 minutes ago"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"judul": "Gov't should conduct COVID-19 test again amid XBB spread: Asmoro",
|
||||||
|
"berita": "",
|
||||||
|
"URL": "https://en.antaranews.com/news/259037/govt-should-conduct-covid-19-test-again-amid-xbb-spread-asmoro",
|
||||||
|
"gambar": null,
|
||||||
|
"tgl": " 51 minutes ago"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"judul": "Indonesian military builds six church bell towers in Papua",
|
||||||
|
"berita": "",
|
||||||
|
"URL": "https://en.antaranews.com/news/208661/indonesian-military-builds-six-church-bell-towers-in-papua",
|
||||||
|
"gambar": null,
|
||||||
|
"tgl": " 9th January 2022"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"judul": "Ancol Dreamland, Asuransi Astra, Hilo, Janji Jiwa Jiwa Toast are among the winners of the 2022-2023 Brand of the Year Awards.",
|
||||||
|
"berita": "",
|
||||||
|
"URL": "https://en.antaranews.com/news/259029/ancol-dreamland-asuransi-astra-hilo-janji-jiwa-jiwa-toast-are-among-the-winners-of-the-2022-2023-brand-of-the-year-awards",
|
||||||
|
"gambar": null,
|
||||||
|
"tgl": " 5 hours ago"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
<li>https://api.gabrielkheisa.xyz/news/coindesk/</li>
|
||||||
|
<pre>
|
||||||
|
{
|
||||||
|
"server_update": "08-11-2022 12:25:51",
|
||||||
|
"news": [
|
||||||
|
{
|
||||||
|
"judul": "Bitcoin, Ether Slide as Protective Puts Draw Demand Amid Sell-Off in FTX's Token",
|
||||||
|
"berita": "Options market tied to bitcoin and ether shows renewed bias for puts, perhaps a sign of investor fears that the FTX-Alameda drama may bring another market-wide crash.",
|
||||||
|
"URL": "https://www.coindesk.com/markets/2022/11/08/bitcoin-ether-slide-as-protective-puts-draw-demand-amid-sell-off-in-ftx-token/",
|
||||||
|
"gambar": "",
|
||||||
|
"tgl": "Nov 8, 2022"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"judul": "FTX Token Plummets as Market Fears Possible Alameda Contagion",
|
||||||
|
"berita": "FTT was down nearly 12% in the last hour and over 20% during the last 24 hours.",
|
||||||
|
"URL": "https://www.coindesk.com/markets/2022/11/08/ftt-plummets-as-market-fears-possible-alameda-contagion/",
|
||||||
|
"gambar": "",
|
||||||
|
"tgl": "Nov 8, 2022"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"judul": "First Mover Asia: A Good Week for Exchange Tokens, Except FTT; Solana Continues Falling",
|
||||||
|
"berita": "During the past week, a number of exchange tokens have outperformed bitcoin, including OKX and CRO. FTT is not among them.",
|
||||||
|
"URL": "https://www.coindesk.com/markets/2022/11/08/first-mover-asia-a-good-week-for-exchange-tokens-except-ftt-solana-continues-falling/",
|
||||||
|
"gambar": "",
|
||||||
|
"tgl": "Nov 8, 2022"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"judul": "Market Wrap: Solana Plunge Highlights Major Cryptos\u2019 Day in the Red",
|
||||||
|
"berita": "The native token of the Solana protocol recently fell over 6%; bitcoin and ether dropped more modestly as investors await the midterm elections and latest inflation data. Market Wrap is CoinDesk\u2019s daily newsletter diving into what happened in today's crypto markets.",
|
||||||
|
"URL": "https://www.coindesk.com/markets/2022/11/07/market-wrap-solana-plunge-highlights-major-cryptos-day-in-the-red/",
|
||||||
|
"gambar": "",
|
||||||
|
"tgl": "Nov 8, 2022"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
<li>https://api.gabrielkheisa.xyz/news/gsmarena/</li>
|
||||||
|
<pre>
|
||||||
|
{
|
||||||
|
"server_update": "08-11-2022 12:42:33",
|
||||||
|
"news": [
|
||||||
|
{
|
||||||
|
"judul": "Google releases stable November Android 13 update, new QPR1 Beta 3.1 too",
|
||||||
|
"berita": "As it was just the first Monday of the month, Google released the latest monthly Android update for its still-supported Pixels mere hours ago. The November 2022 update is the last minor one before the bigger December release - which has been in open...",
|
||||||
|
"URL": "https://www.gsmarena.com/google_releases_stable_november_android_13_update_new_qpr1_beta_31_too-news-56434.php",
|
||||||
|
"gambar": "https://fdn.gsmarena.com/imgroot/news/22/11/google-november-update/-344x215/gsmarena_000.jpg",
|
||||||
|
"tgl": "08 November 2022"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"judul": "Google reveals its upcoming Black Friday deals for Pixel 7, Pixel 7 Pro, and Pixel 6a",
|
||||||
|
"berita": "Google is very excited about Black Friday this year, and as such it's letting us know in advance what the deals will be. Over at its online store, the company is now prominently displaying a countdown clock to the start of the deals. At the time of...",
|
||||||
|
"URL": "https://www.gsmarena.com/google_reveals_its_upcoming_black_friday_deals_for_pixel_7_pixel_7_pro_and_pixel_6a-news-56433.php",
|
||||||
|
"gambar": "https://fdn.gsmarena.com/imgroot/news/22/11/google-black-friday-deals/-344x215/gsmarena_000.jpg",
|
||||||
|
"tgl": "08 November 2022"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"judul": "Samsung sets new speed record over 5G - 1.75Gbps at 10km distance",
|
||||||
|
"berita": "Samsung Electronics announced that it reached record-breaking transfer speeds over a 5G mmWave network. The tests were carried out in partnership with NBN Co. a company that is part of an AUD 750 million investment plan in Australia. NBN is using...",
|
||||||
|
"URL": "https://www.gsmarena.com/samsung_reaches_insane_download_speeds_over_5g_10km_away_from_source-news-56432.php",
|
||||||
|
"gambar": "https://fdn.gsmarena.com/imgroot/news/20/10/samsung-q3-report/-344x215/gsmarena_001.jpg",
|
||||||
|
"tgl": "07 November 2022"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"judul": "Xiaomi Redmi K60 confirmed to support 67W fast charging",
|
||||||
|
"berita": "Xiaomi's upcoming Redmi K60 flagship series is set to debut in the coming weeks, although there's still no official date set. The intensity of recent leaks suggests that's fast approaching, and so do the certification documents that are now flying...",
|
||||||
|
"URL": "https://www.gsmarena.com/xiaomi_redmi_k60_confirmed_to_support_67w_fast_charging-news-56431.php",
|
||||||
|
"gambar": "https://fdn.gsmarena.com/imgroot/news/22/03/xiaomi-redmi-k50-colors/-344x215/gsmarena_001.jpg",
|
||||||
|
"tgl": "07 November 2022"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
<li>https://api.gabrielkheisa.xyz/news/theverge/</li>
|
||||||
|
<pre>
|
||||||
|
{
|
||||||
|
"server_update": "08-11-2022 12:54:01",
|
||||||
|
"news": [
|
||||||
|
{
|
||||||
|
"judul": "Elon Musk has discussed putting all of Twitter behind a paywall",
|
||||||
|
"berita": "",
|
||||||
|
"URL": "https://www.theverge.com/2022/11/7/23446262/elon-musk-twitter-paywall-possible",
|
||||||
|
"gambar": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7",
|
||||||
|
"tgl": "8:52 AM GMT+7"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"judul": "AMC is working with Zoom to turn some theaters into giant meeting rooms",
|
||||||
|
"berita": "",
|
||||||
|
"URL": "https://www.theverge.com/2022/11/7/23446136/amc-zoom-rooms-theaters-meetings",
|
||||||
|
"gambar": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7",
|
||||||
|
"tgl": "8:38 AM GMT+7"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"judul": "How am I supposed to mark myself as parody if I can\u2019t change my screen name, Elon?",
|
||||||
|
"berita": "",
|
||||||
|
"URL": "https://www.theverge.com/2022/11/7/23446171/screen-name-twitter-musk-parody-whoops",
|
||||||
|
"gambar": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7",
|
||||||
|
"tgl": "8:32 AM GMT+7"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"judul": "T-Mobile may be looking to spend big on fiber home internet",
|
||||||
|
"berita": "",
|
||||||
|
"URL": "https://www.theverge.com/2022/11/7/23445777/t-mobile-home-internet-fiber-5g-partnership-search",
|
||||||
|
"gambar": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7",
|
||||||
|
"tgl": "6:09 AM GMT+7"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
</ul>
|
84
index.php
Normal file
84
index.php
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
// Declare the response body as UTF-8 encoded JSON before any output is sent.
header('Content-Type: application/json; charset=utf-8');
|
||||||
|
|
||||||
|
/**
 * Re-indent a compact JSON document so that it is readable by humans.
 *
 * The input is scanned character by character; it is never decoded, so the
 * text inside string literals is copied through untouched. A line break is
 * emitted after every ',', '{' and '[' and before every '}' and ']' that
 * appears outside a string literal, and each line is indented by one
 * $indentStr per nesting level.
 *
 * @param string $json JSON text to format.
 * @return string The same JSON text with line breaks and indentation added.
 */
function json_indent($json)
{
    $json        = (string) $json;
    $result      = '';
    $pos         = 0;           // current nesting depth
    $strLen      = strlen($json);
    $indentStr   = ' ';
    $newLine     = "\n";
    $outOfQuotes = true;
    $escaped     = false;       // true when the previous in-string char was an unconsumed backslash

    // Stop at the last real character; reading one past the end yields no data.
    for ($i = 0; $i < $strLen; $i++) {
        $char = $json[$i];

        if (!$outOfQuotes) {
            // Inside a string literal: copy verbatim and track escapes as a
            // state machine. Looking only at the previous character would
            // misread the closing quote of a string ending in "\\".
            $result .= $char;
            if ($escaped) {
                $escaped = false;
            } elseif ($char === '\\') {
                $escaped = true;
            } elseif ($char === '"') {
                $outOfQuotes = true;
            }
            continue;
        }

        if ($char === '"') {
            // Opening quote of a string literal.
            $outOfQuotes = false;
        } elseif ($char === '}' || $char === ']') {
            // End of an element: break the line and step the indent back.
            $result .= $newLine;
            $pos--;
            // Guard against unbalanced input driving the depth negative.
            $result .= str_repeat($indentStr, max($pos, 0));
        }

        // Add the character to the result string.
        $result .= $char;

        if ($char === ',' || $char === '{' || $char === '[') {
            // Start of a new element: break the line and indent the next one.
            $result .= $newLine;
            if ($char !== ',') {
                $pos++;
            }
            $result .= str_repeat($indentStr, max($pos, 0));
        }
    }

    return $result;
}
|
||||||
|
|
||||||
|
// Request parameters:
//   key  - when empty/absent the cached document is returned; otherwise it
//          must match $key for the cache to be overwritten.
//   news - the text to store in cache.txt on an authorised write.
// Non-string values (e.g. "key[]=x") are treated as absent.
$news     = (isset($_REQUEST["news"]) && is_string($_REQUEST["news"])) ? $_REQUEST["news"] : "";
$keyinput = (isset($_REQUEST["key"]) && is_string($_REQUEST["key"])) ? $_REQUEST["key"] : "";
$key      = "API_KEY";

if ($keyinput == "") {
    // Read path: serve the cached document, pretty-printed.
    // file_get_contents() also copes with an empty cache file, where
    // fread() with a zero length would fail.
    $teks = file_get_contents("cache.txt");
    if ($teks === false) {
        die("Unable to open file!");
    }

    //echo json_encode($teks, JSON_PRETTY_PRINT);
    echo json_indent($teks);
} else {
    // Write path: only a caller presenting the shared key may replace the cache.
    // hash_equals() compares in constant time so the key cannot be probed
    // through response timing.
    if (hash_equals($key, $keyinput)) {
        if (file_put_contents("cache.txt", $news) === false) {
            die("Unable to open file!");
        }
        echo "Done!";
    } else {
        // No file handle was opened on this path, so there is nothing to close.
        echo "Key Invalid";
    }
}
|
||||||
|
|
||||||
|
?>
|
88
script.py
Normal file
88
script.py
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
import requests
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
|
||||||
|
key = "API_KEY"
|
||||||
|
|
||||||
|
# Absolute XPath of the n-th (1-based) headline <article> on the page opened
# below; the per-field paths are appended to it.
ARTICLE_XPATH = "/html/body/div[1]/div/div[2]/div[3]/div[1]/section[1]/div/div[2]/div/div/div[1]/article[{n}]"
# Number of headline articles collected per run.
ARTICLE_COUNT = 4
# Endpoint that stores the scraped document (index.php, which reads "key" and "news").
UPLOAD_URL = 'https://api.gabrielkheisa.xyz/news/antaranews/index.php'


def _scrape_article(browser, n):
    """Return the dict describing the n-th (1-based) headline article."""
    base = ARTICLE_XPATH.format(n=n)
    # The same <a> element supplies both the title text and the article URL.
    link = browser.find_element_by_xpath(base + "/header/h3/a")
    return {
        "judul": link.get_attribute('textContent'),
        "berita": "",
        "URL": link.get_attribute('href'),
        "gambar": browser.find_element_by_xpath(base + "/div/a/picture/img").get_attribute('src'),
        "tgl": browser.find_element_by_xpath(base + "/header/p/span").get_attribute('textContent'),
    }


# Scrape the front page once an hour, forever. A failed run is reported and
# skipped; the loop then sleeps and tries again.
while True:
    # Reset on every iteration so the cleanup below never touches a driver
    # from a previous run or an unbound name when Chrome fails to start.
    browser = None
    try:
        dt_string = datetime.now().strftime("%d-%m-%Y %H:%M:%S")

        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        browser = webdriver.Chrome(executable_path="/usr/bin/chromedriver", chrome_options=options)

        browser.delete_all_cookies()
        browser.get("https://en.antaranews.com/")

        # Let element lookups wait up to 5 seconds for the page to render.
        browser.implicitly_wait(5)

        #Quick copy
        #browser.find_element_by_xpath("").get_attribute('')

        news = json.dumps({
            "server_update": dt_string,
            "news": [_scrape_article(browser, n) for n in range(1, ARTICLE_COUNT + 1)],
        })

        print(news)

        # Send the key and payload in a POST body rather than the query
        # string: no URL length limit and the key stays out of access logs.
        # requests form-encodes the values; the timeout stops a stalled
        # server from hanging the loop forever.
        response = requests.post(UPLOAD_URL, data={"key": key, "news": news}, timeout=30)
        response.raise_for_status()
        # index.php answers "Done!" on a successful write.
        if response.text.strip() != "Done!":
            print("Unexpected server reply: " + response.text.strip())
    except Exception as exc:
        # Exception (not a bare except) so Ctrl+C can still stop the script.
        print("Error gak jelas, skip")
        print(repr(exc))
    finally:
        # Always release the browser process, whether the run succeeded or not.
        if browser is not None:
            try:
                browser.quit()
            except Exception as exc:
                print("browser.quit() failed: " + repr(exc))

    print("Sleep for 1 hour")
    time.sleep(60*60)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user