From 2bf82e80ba857793ae3cf54c6ed3b16cbbb6b50f Mon Sep 17 00:00:00 2001 From: Suraj Shetty Date: Mon, 5 Feb 2024 15:41:05 +0530 Subject: [PATCH] fix: Make sure sitemap respects robots.txt --- frappe/cache_manager.py | 1 - frappe/website/utils.py | 1 - frappe/www/sitemap.py | 23 ++++++++++++++++------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/frappe/cache_manager.py b/frappe/cache_manager.py index 11302820c5..769be78709 100644 --- a/frappe/cache_manager.py +++ b/frappe/cache_manager.py @@ -39,7 +39,6 @@ global_cache_keys = ( "domain_restricted_doctypes", "domain_restricted_pages", "information_schema:counts", - "sitemap_routes", "db_tables", "server_script_autocompletion_items", ) + doctype_map_keys diff --git a/frappe/website/utils.py b/frappe/website/utils.py index 568d11777d..d92e4f033f 100644 --- a/frappe/website/utils.py +++ b/frappe/website/utils.py @@ -367,7 +367,6 @@ def clear_cache(path=None): "website_generator_routes", "website_pages", "website_full_index", - "sitemap_routes", "languages_with_name", "languages", ): diff --git a/frappe/www/sitemap.py b/frappe/www/sitemap.py index ebe6846a39..faff15c693 100644 --- a/frappe/www/sitemap.py +++ b/frappe/www/sitemap.py @@ -1,12 +1,14 @@ # Copyright (c) 2022, Frappe Technologies Pvt. Ltd. and Contributors # License: MIT. 
See LICENSE +from urllib import robotparser from urllib.parse import quote import frappe from frappe.model.document import get_controller from frappe.utils import get_url, nowdate -from frappe.website.router import get_pages +from frappe.utils.caching import redis_cache +from frappe.website.router import get_doctypes_with_web_view, get_pages no_cache = 1 base_template_path = "www/sitemap.xml" @@ -31,20 +33,24 @@ def get_context(context): return {"links": links} +@redis_cache() def get_public_pages_from_doctypes(): """Return pages from doctypes that are publicly accessible.""" def get_sitemap_routes(): routes = {} - doctypes_with_web_view = frappe.get_all( - "DocType", - filters={"has_web_view": True, "allow_guest_to_view": True}, - pluck="name", - ) + doctypes_with_web_view = get_doctypes_with_web_view() + rp = None + if robots_txt := frappe.db.get_single_value("Website Settings", "robots_txt"): + rp = robotparser.RobotFileParser() + rp.parse(robots_txt.splitlines()) for doctype in doctypes_with_web_view: controller = get_controller(doctype) meta = frappe.get_meta(doctype) + if not meta.allow_guest_to_view: + continue + condition_field = meta.is_published_field or controller.website.condition_field if not condition_field: @@ -61,6 +67,9 @@ def get_public_pages_from_doctypes(): raise e for r in res: + if rp and not rp.can_fetch("*", f"/{r.route}"): + continue + routes[r.route] = { "doctype": doctype, "name": r.name, @@ -69,4 +78,4 @@ def get_public_pages_from_doctypes(): return routes - return frappe.cache.get_value("sitemap_routes", get_sitemap_routes) + return get_sitemap_routes()