From bc8fd3294ff7bf9fbb41b2ae0ace135682dcfb71 Mon Sep 17 00:00:00 2001 From: Ido Shamun Date: Mon, 25 Aug 2025 20:45:29 +0300 Subject: [PATCH] feat: add final_url property to article final_url stores the actual url used to fetch the html after redirects and meta refresh --- newspaper/article.py | 16 +++++++++++++++- newspaper/network.py | 10 ++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/newspaper/article.py b/newspaper/article.py index dd3a25e..4a1b6ad 100644 --- a/newspaper/article.py +++ b/newspaper/article.py @@ -153,6 +153,9 @@ class Article(object): # A property dict for users to store custom data. self.additional_data = {} + # The final URL after redirects and meta refresh + self.final_url = None + def build(self): """Build a lone article from a URL independent of the source (newspaper). Don't normally call this method b/c it's good to multithread articles @@ -173,7 +176,9 @@ class Article(object): def _parse_scheme_http(self): try: - return network.get_html_2XX_only(self.url, self.config) + html, final_url = network.get_html_2XX_only(self.url, self.config, return_final_url=True) + self.final_url = final_url + return html except requests.exceptions.RequestException as e: self.download_state = ArticleDownloadState.FAILED_RESPONSE self.download_exception_msg = str(e) @@ -190,18 +195,27 @@ class Article(object): parsed_url = urlparse(self.url) if parsed_url.scheme == "file": html = self._parse_scheme_file(parsed_url.path) + # For file scheme, the final URL is the same as the initial URL + if self.final_url is None: + self.final_url = self.url else: html = self._parse_scheme_http() + # final_url is already set in _parse_scheme_http if html is None: log.debug('Download failed on URL %s because of %s' % (self.url, self.download_exception_msg)) return else: html = input_html + # If HTML is provided directly and final_url not set, use the current URL + if self.final_url is None: + self.final_url = self.url if self.config.follow_meta_refresh: meta_refresh_url = extract_meta_refresh(html) if meta_refresh_url and recursion_counter < 1: + # Update final_url to the meta refresh URL + self.final_url = meta_refresh_url return self.download( input_html=network.get_html(meta_refresh_url), recursion_counter=recursion_counter + 1) diff --git a/newspaper/network.py b/newspaper/network.py index 8ba02aa..3fb448f 100644 --- a/newspaper/network.py +++ b/newspaper/network.py @@ -44,7 +44,7 @@ def get_html(url, config=None, response=None): return '' -def get_html_2XX_only(url, config=None, response=None): +def get_html_2XX_only(url, config=None, response=None, return_final_url=False): """Consolidated logic for http requests from newspaper. We handle error cases: - Attempt to find encoding of the html by using HTTP header. Fallback to 'ISO-8859-1' if not provided. @@ -58,17 +58,23 @@ def get_html_2XX_only(url, config=None, response=None): allow_redirects = config.allow_redirects if response is not None: - return _get_html_from_response(response, config) + html = _get_html_from_response(response, config) + if return_final_url: + return html, getattr(response, 'url', url) + return html response = requests.get( url=url, **get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects)) html = _get_html_from_response(response, config) + final_url = response.url if config.http_success_only: # fail if HTTP sends a non 2XX response response.raise_for_status() + if return_final_url: + return html, final_url return html