From bc8fd3294ff7bf9fbb41b2ae0ace135682dcfb71 Mon Sep 17 00:00:00 2001
From: Ido Shamun <idoesh1@gmail.com>
Date: Mon, 25 Aug 2025 20:45:29 +0300
Subject: [PATCH] feat: add final_url property to article

final_url stores the actual URL used to fetch the HTML after redirects and meta refresh.
---
 newspaper/article.py | 16 +++++++++++++++-
 newspaper/network.py | 10 ++++++++--
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/newspaper/article.py b/newspaper/article.py
index dd3a25e..4a1b6ad 100644
--- a/newspaper/article.py
+++ b/newspaper/article.py
@@ -153,6 +153,9 @@ class Article(object):
         # A property dict for users to store custom data.
         self.additional_data = {}
 
+        # The final URL after redirects and meta refresh
+        self.final_url = None
+
     def build(self):
         """Build a lone article from a URL independent of the source (newspaper).
         Don't normally call this method b/c it's good to multithread articles
@@ -173,7 +176,9 @@ class Article(object):
 
     def _parse_scheme_http(self):
         try:
-            return network.get_html_2XX_only(self.url, self.config)
+            html, final_url = network.get_html_2XX_only(self.url, self.config, return_final_url=True)
+            self.final_url = final_url
+            return html
         except requests.exceptions.RequestException as e:
             self.download_state = ArticleDownloadState.FAILED_RESPONSE
             self.download_exception_msg = str(e)
@@ -190,18 +195,27 @@ class Article(object):
             parsed_url = urlparse(self.url)
             if parsed_url.scheme == "file":
                 html = self._parse_scheme_file(parsed_url.path)
+                # For file scheme, the final URL is the same as the initial URL
+                if self.final_url is None:
+                    self.final_url = self.url
             else:
                 html = self._parse_scheme_http()
+                # final_url is already set in _parse_scheme_http
             if html is None:
                 log.debug('Download failed on URL %s because of %s' %
                           (self.url, self.download_exception_msg))
                 return
         else:
             html = input_html
+            # If HTML is provided directly and final_url not set, use the current URL
+            if self.final_url is None:
+                self.final_url = self.url
 
         if self.config.follow_meta_refresh:
             meta_refresh_url = extract_meta_refresh(html)
             if meta_refresh_url and recursion_counter < 1:
+                # Update final_url to the meta refresh URL
+                self.final_url = meta_refresh_url
                 return self.download(
                     input_html=network.get_html(meta_refresh_url),
                     recursion_counter=recursion_counter + 1)
diff --git a/newspaper/network.py b/newspaper/network.py
index 8ba02aa..3fb448f 100644
--- a/newspaper/network.py
+++ b/newspaper/network.py
@@ -44,7 +44,7 @@ def get_html(url, config=None, response=None):
         return ''
 
 
-def get_html_2XX_only(url, config=None, response=None):
+def get_html_2XX_only(url, config=None, response=None, return_final_url=False):
     """Consolidated logic for http requests from newspaper. We handle error cases:
     - Attempt to find encoding of the html by using HTTP header. Fallback to
       'ISO-8859-1' if not provided.
@@ -58,17 +58,23 @@ def get_html_2XX_only(url, config=None, response=None):
     allow_redirects = config.allow_redirects
 
     if response is not None:
-        return _get_html_from_response(response, config)
+        html = _get_html_from_response(response, config)
+        if return_final_url:
+            return html, getattr(response, 'url', url)
+        return html
 
     response = requests.get(
         url=url, **get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects))
 
     html = _get_html_from_response(response, config)
+    final_url = response.url
 
     if config.http_success_only:
         # fail if HTTP sends a non 2XX response
         response.raise_for_status()
 
+    if return_final_url:
+        return html, final_url
     return html