# Copyright (c) 2020, Frappe Technologies Pvt. Ltd. and Contributors
# License: MIT. See LICENSE
import os

from bs4 import BeautifulSoup
from whoosh.fields import ID, TEXT, Schema

import frappe
from frappe.search.full_text_search import FullTextSearch
from frappe.utils import set_request, update_progress_bar
from frappe.website.serve import get_response_content

INDEX_NAME = "web_routes"


class WebsiteSearch(FullTextSearch):
	"""FullTextSearch wrapper that indexes website routes"""

	def get_schema(self):
		return Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True))

	def get_fields_to_search(self):
		return ["title", "content"]

	def get_id(self):
		return "path"

	def get_items_to_index(self):
		"""Get all routes to be indexed. This includes the static pages
		in www/ and routes from published documents.

		Returns:
			list: `frappe._dict` documents (title, content, path) for every route
		"""
		if getattr(self, "_items_to_index", False):
			return self._items_to_index

		self._items_to_index = []

		routes = get_static_pages_from_all_apps() + slugs_with_web_view(self._items_to_index)

		for i, route in enumerate(routes):
			update_progress_bar("Retrieving Routes", i, len(routes))
			self._items_to_index += [self.get_document_to_index(route)]

		print()  # terminate the progress bar line

		# _items_to_index is now populated, so the recursive call returns the cached list
		return self.get_items_to_index()

	def get_document_to_index(self, route):
		"""Render a page and parse it using BeautifulSoup

		Args:
			route (str): route of the page to be parsed

		Returns:
			document (_dict): A dictionary with title, path and content
		"""
		frappe.set_user("Guest")
		frappe.local.no_cache = True

		try:
			set_request(method="GET", path=route)
			content = get_response_content(route)
			soup = BeautifulSoup(content, "html.parser")
			page_content = soup.find(class_="page_content")
			text_content = page_content.text if page_content else ""
			title = soup.title.text.strip() if soup.title else route

			return frappe._dict(title=title, content=text_content, path=route)
		except Exception:
			pass  # a route that fails to render yields no document
		finally:
			frappe.set_user("Administrator")

	def parse_result(self, result):
		title_highlights = result.highlights("title")
		content_highlights = result.highlights("content")

		return frappe._dict(
			title=result["title"],
			path=result["path"],
			title_highlights=title_highlights,
			content_highlights=content_highlights,
		)

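# Illustrative usage, not part of the original module: a minimal search call,
# assuming the FullTextSearch base class provides a `search()` method and that
# hits are shaped by `parse_result` above.
#
#   ws = WebsiteSearch(INDEX_NAME)
#   for hit in ws.search("getting started"):
#       print(hit.title, hit.path, hit.content_highlights)
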
def slugs_with_web_view(_items_to_index):
	all_routes = []
	filters = {"has_web_view": 1, "allow_guest_to_view": 1, "index_web_pages_for_search": 1}
	fields = ["name", "is_published_field", "website_search_field"]
	doctype_with_web_views = frappe.get_all("DocType", filters=filters, fields=fields)

	for doctype in doctype_with_web_views:
		if doctype.is_published_field:
			fields = ["route", doctype.website_search_field]
			filters = {doctype.is_published_field: 1}

			if doctype.website_search_field:
				# doctypes with a search field are indexed directly from their content
				docs = frappe.get_all(doctype.name, filters=filters, fields=fields + ["title"])
				for doc in docs:
					content = frappe.utils.md_to_html(getattr(doc, doctype.website_search_field))
					soup = BeautifulSoup(content, "html.parser")
					text_content = soup.text if soup else ""
					_items_to_index += [frappe._dict(title=doc.title, content=text_content, path=doc.route)]
			else:
				# doctypes without a search field only contribute their routes,
				# which are rendered and parsed later
				docs = frappe.get_all(doctype.name, filters=filters, fields=fields)
				all_routes += [route.route for route in docs]

	return all_routes


def get_static_pages_from_all_apps():
	from glob import glob

	apps = frappe.get_installed_apps()

	routes_to_index = []
	for app in apps:
		path_to_index = frappe.get_app_path(app, "www")

		files_to_index = glob(path_to_index + "/**/*.html", recursive=True)
		files_to_index.extend(glob(path_to_index + "/**/*.md", recursive=True))
		for file in files_to_index:
			route = os.path.relpath(file, path_to_index).split(".", maxsplit=1)[0]
			if route.endswith("index"):
				# index pages are served at the route of their folder
				route = route.rsplit("index", 1)[0]
			routes_to_index.append(route)

	return routes_to_index


def update_index_for_path(path):
	ws = WebsiteSearch(INDEX_NAME)
	return ws.update_index_by_name(path)


def remove_document_from_index(path):
	ws = WebsiteSearch(INDEX_NAME)
	return ws.remove_document_from_index(path)


def build_index_for_all_routes():
	from frappe.utils.synchronization import filelock

	with filelock("building_website_search"):
		ws = WebsiteSearch(INDEX_NAME)
		return ws.build()
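
# Illustrative usage, not part of the original module: the typical maintenance
# flow, e.g. from a bench console or a scheduled job. `update_index_by_name` and
# `remove_document_from_index` are assumed to come from the FullTextSearch base
# class; the route arguments below are hypothetical examples.
#
#   build_index_for_all_routes()                   # full rebuild behind a file lock
#   update_index_for_path("blog/my-post")          # re-index a single route
#   remove_document_from_index("blog/old-post")    # drop a route that no longer exists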