fix: Make sure sitemap respects robots_txt

This commit is contained in:
Suraj Shetty 2024-02-05 15:41:05 +05:30
parent d1f308aad3
commit 2bf82e80ba
3 changed files with 16 additions and 9 deletions

View file

@ -39,7 +39,6 @@ global_cache_keys = (
"domain_restricted_doctypes",
"domain_restricted_pages",
"information_schema:counts",
"sitemap_routes",
"db_tables",
"server_script_autocompletion_items",
) + doctype_map_keys

View file

@ -367,7 +367,6 @@ def clear_cache(path=None):
"website_generator_routes",
"website_pages",
"website_full_index",
"sitemap_routes",
"languages_with_name",
"languages",
):

View file

@ -1,12 +1,14 @@
# Copyright (c) 2022, Frappe Technologies Pvt. Ltd. and Contributors
# License: MIT. See LICENSE
from urllib import robotparser
from urllib.parse import quote
import frappe
from frappe.model.document import get_controller
from frappe.utils import get_url, nowdate
from frappe.website.router import get_pages
from frappe.utils.caching import redis_cache
from frappe.website.router import get_doctypes_with_web_view, get_pages
no_cache = 1
base_template_path = "www/sitemap.xml"
@ -31,20 +33,24 @@ def get_context(context):
return {"links": links}
@redis_cache()
def get_public_pages_from_doctypes():
"""Return pages from doctypes that are publicly accessible."""
def get_sitemap_routes():
routes = {}
doctypes_with_web_view = frappe.get_all(
"DocType",
filters={"has_web_view": True, "allow_guest_to_view": True},
pluck="name",
)
doctypes_with_web_view = get_doctypes_with_web_view()
rp = None
if robots_txt := frappe.db.get_single_value("Website Settings", "robots_txt"):
rp = robotparser.RobotFileParser()
rp.parse(robots_txt.splitlines())
for doctype in doctypes_with_web_view:
controller = get_controller(doctype)
meta = frappe.get_meta(doctype)
if not meta.allow_guest_to_view:
continue
condition_field = meta.is_published_field or controller.website.condition_field
if not condition_field:
@ -61,6 +67,9 @@ def get_public_pages_from_doctypes():
raise e
for r in res:
if rp and not rp.can_fetch("*", f"/{r.route}"):
continue
routes[r.route] = {
"doctype": doctype,
"name": r.name,
@ -69,4 +78,4 @@ def get_public_pages_from_doctypes():
return routes
return frappe.cache.get_value("sitemap_routes", get_sitemap_routes)
return get_sitemap_routes()