여기서 뉴스 열 개를 스크랩해 오려고 했는데,
스크랩해 보니
https://news.google.com/read/CBMiXkFVX3lxTE44Z3FzY2NtbEZhVFdjR0ZrU1NBRFU2TUZPbGg1Rmx6aTNkOXpUQ0VpV3hVOEdFRVVfUHdjcnhkTEY1bUVSM3hkdjBHSURjOWwtSHdtZ3BScWtsdmgtMVHSAXZBVV95cUxNNGM3bXRQOVhkaUJhRDQ0NDRuTXR5TTRBT0FjZ1c0V0VZMFhqRzFwWEpJU3piRU9HeXV1YThGSS11V19qX0p1SGpHLTZHM1Q1TDhRRTY0bFVFV1F3TzZEQkhBYU9PUHdlSW16dDhMOUdINlQtRm9R?hl=ko&gl=KR&ceid=KR%3Ako
링크가
이런 식으로 redirect로 되어 있어서
막혔습니다.
이걸 우회하려고
# Resolve a short link by letting `requests` follow the HTTP redirects;
# `r.url` is the URL of the final response. Only HTTP-level (3xx) redirects
# are followed this way.
# A timeout is passed explicitly because `requests` otherwise waits
# indefinitely for an unresponsive server.
# NOTE(review): `requests` is not imported in the visible part of this file.
r = requests.get('https://youtu.be/dQw4w9WgXcQ', timeout=10)
print(r.url)  # https://www.youtube.com/watch?v=dQw4w9WgXcQ&feature=youtu.be
이것도 안 되고,
from selenium import webdriver
def get_final_link_js(url):
    """Open ``url`` in Chrome via Selenium and return the browser's current URL.

    Requires the ``selenium`` package and a matching browser driver
    (e.g. ChromeDriver). The URL is read right after ``get()`` returns; no
    explicit wait is performed, so add one here if the page needs more time
    or interaction before it settles on its final address.

    The browser session is always shut down, even when navigation fails.
    """
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        # Wait or interact with the page here if required before reading
        # the address bar.
        return browser.current_url
    finally:
        browser.quit()
이 방법은 API Function 스크립트로는 안 되고 인스턴스를 만든 다음에 해야 할 것 같아서 일단 보류.
import urllib.request
import re
import urllib.parse
# Absolute prefix that the page-relative "./read/..." hrefs are attached to.
_GOOGLE_NEWS_BASE = "https://news.google.com/"

# Matches href="./read/<id>" and captures everything after the leading "./".
_READ_HREF_RE = re.compile(r'href="\./(read/[^"]+)"')


def extract_read_links(url, limit=10, timeout=10):
    """Return up to ``limit`` distinct Google News "read" links found at ``url``.

    The page is downloaded with a browser-like User-Agent, every
    ``href="./read/..."`` attribute is located, and each one is turned into
    an absolute ``https://news.google.com/read/...`` URL.

    Args:
        url: Address of the page to scan.
        limit: Maximum number of links to return. ``None`` means no cap;
            zero or a negative value yields an empty list.
        timeout: Seconds to wait for the network before giving up.

    Returns:
        A list of absolute URLs in page order, without duplicates.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # Local import keeps this block self-contained; the file's import
    # section does not bring in ``html``.
    import html

    if limit is not None and limit <= 0:
        return []

    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req, timeout=timeout) as response:
        html_content = response.read().decode("utf-8", errors="ignore")

    results = []
    seen = set()
    for match in _READ_HREF_RE.finditer(html_content):
        # Attribute values are HTML-escaped in the markup ("&amp;" etc.),
        # so decode entities before using the value as a URL.
        full_url = _GOOGLE_NEWS_BASE + html.unescape(match.group(1))
        # The same href can appear more than once in the page; count each
        # target only once so `limit` refers to distinct links.
        if full_url in seen:
            continue
        seen.add(full_url)
        results.append(full_url)
        if limit is not None and len(results) >= limit:
            break
    return results
# Any <meta ...> tag.
_META_TAG_RE = re.compile(r"<meta\b[^>]*>", re.IGNORECASE)
# http-equiv="refresh" with double, single or no quotes.
_REFRESH_EQUIV_RE = re.compile(
    r"""http-equiv\s*=\s*["']?refresh\b""", re.IGNORECASE
)
# The value of the content attribute (double-quoted, single-quoted, bare).
_CONTENT_ATTR_RE = re.compile(
    r"""(?<![\w-])content\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))""",
    re.IGNORECASE,
)
# The "url=..." part inside a refresh content value.
_REFRESH_TARGET_RE = re.compile(r"url\s*=\s*(.*)", re.IGNORECASE | re.DOTALL)


def _meta_refresh_target(page_html):
    """Return the target of the first meta-refresh tag in ``page_html``, or None."""
    # Local import keeps this block self-contained; the file's import
    # section does not bring in ``html``.
    import html

    for tag in _META_TAG_RE.finditer(page_html):
        tag_text = tag.group(0)
        if not _REFRESH_EQUIV_RE.search(tag_text):
            continue
        content = _CONTENT_ATTR_RE.search(tag_text)
        if content is None:
            continue
        raw_value = next(g for g in content.groups() if g is not None)
        # Entities must be decoded before looking for the URL; otherwise the
        # ";" of "&amp;" looks like a separator and truncates the target.
        target = _REFRESH_TARGET_RE.search(html.unescape(raw_value))
        if target is None:
            continue
        # The URL inside the content value may itself be wrapped in quotes.
        cleaned = target.group(1).strip().strip("\"'").strip()
        if cleaned:
            return cleaned
    return None


def get_final_destination(url, timeout=10):
    """Resolve ``url`` to the address it redirects to.

    HTTP 3xx redirects are followed by ``urllib``; if the response URL
    differs from ``url`` that address is returned. Otherwise the body is
    searched for a ``<meta http-equiv="refresh">`` tag and its target,
    resolved against ``url``, is returned. JavaScript is not executed, so
    script-driven redirects are not detected.

    Args:
        url: Address to resolve.
        timeout: Seconds to wait for the network before giving up.

    Returns:
        The redirect target, or ``url`` itself when no HTTP redirect or
        meta-refresh target is found.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        resolved_url = resp.geturl()
        if resolved_url != url:
            # Found a standard 3xx redirect.
            return resolved_url
        body = resp.read().decode("utf-8", errors="ignore")

    meta_url = _meta_refresh_target(body)
    if meta_url is not None:
        return urllib.parse.urljoin(url, meta_url)
    return url
def main():
    """Print the first ten "read" links of a Google News page and their targets.

    Each extracted link is resolved independently: a network failure on one
    link is reported on that link's output line and the remaining links are
    still processed.
    """
    # Example Google News URL (Bellevue/Seattle news, etc.)
    google_news_url = (
        "https://news.google.com/topics/"
        "CAAqHAgKIhZDQklTQ2pvSWJHOWpZV3hmZGpJb0FBUAE/"
        "sections/CAQiTkNCSVNORG9JYkc5allXeGZkakpDRUd4dlkyRnNYM1l5"
        "WDNObFkzUnBiMjV5Q2hJSUwyMHZNR1E1YW5KNkNnb0lMMjB2TUdRNWFuSW9BQS"
        "owCAAqLAgKIiZDQklTRmpvSWJHOWpZV3hmZGpKNkNnb0lMMjB2TUdRNWFuSW9BQVABUAE"
        "?hl=ko&gl=KR&ceid=KR%3Ako"
    )

    # Step 1: Extract up to 10 ./read/... links
    read_links = extract_read_links(google_news_url, limit=10)
    if not read_links:
        print("No ./read/ links found.")
        return

    # Step 2: For each extracted link, find its ultimate final destination
    for i, link in enumerate(read_links, start=1):
        try:
            final_url = get_final_destination(link)
        except OSError as exc:
            # urllib's URLError/HTTPError and socket timeouts all derive
            # from OSError; keep going so one bad link does not abort the run.
            final_url = f"<unresolved: {exc}>"
        print(f"{i}. Extracted: {link}\n Final: {final_url}\n")


if __name__ == "__main__":
    main()


