# Copyright (c) 2021, Frappe Technologies Pvt. Ltd. and Contributors
# License: MIT. See LICENSE

import json
import os
import re

import redis

import frappe
from frappe.model.base_document import get_controller
from frappe.utils import cint, strip_html_tags
from frappe.utils.data import cstr
from frappe.utils.html_utils import unescape_html

# Matches a whole <script>...</script> or <style>...</style> element, body included.
# The trailing backreference is essential: without it the lazy `.*?` matches nothing
# and only the opening "<script" / "<style" text would be removed.
HTML_TAGS_PATTERN = re.compile(r"(?s)<[\s]*(script|style).*?</\1>")


def setup_global_search_table():
    """
    Creates __global_search table
    :return:
    """
    frappe.db.create_global_search_table()


def reset():
    """
    Deletes all data in __global_search
    :return:
    """
    frappe.db.delete("__global_search")


def get_doctypes_with_global_search(with_child_tables=True):
    """
    Return names of doctypes that have global search fields and whose module
    belongs to an installed app.

    :param with_child_tables: when falsy, child-table and single doctypes are skipped
    :return: list of doctype names (served from cache when available)
    """

    def _get():
        filters = {}
        if not with_child_tables:
            filters = {"istable": ["!=", 1], "issingle": ["!=", 1]}

        global_search_doctypes = [
            d
            for d in frappe.get_all("DocType", fields=["name", "module"], filters=filters)
            if frappe.get_meta(d.name).get_global_search_fields()
        ]

        installed_apps = frappe.get_installed_apps()
        module_app = frappe.local.module_app

        return [
            d.name
            for d in global_search_doctypes
            if module_app.get(frappe.scrub(d.module))
            and module_app[frappe.scrub(d.module)] in installed_apps
        ]

    # NOTE(review): the cache key does not vary with `with_child_tables`, so the
    # variant computed first is what later callers receive - confirm this is intended.
    return frappe.cache.get_value("doctypes_with_global_search", _get)


def rebuild_for_doctype(doctype):
    """
    Rebuild entries of doctype's documents in __global_search on change of
    searchable fields.

    Single doctypes are skipped. For a child-table doctype the rebuild is
    delegated to every parent doctype that references it.

    :param doctype: Doctype
    """
    if frappe.local.conf.get("disable_global_search"):
        return

    meta = frappe.get_meta(doctype)

    def _get_filters():
        # Skip cancelled documents, and disabled ones when the doctype tracks that.
        filters = frappe._dict({"docstatus": ["!=", 2]})
        if meta.has_field("enabled"):
            filters.enabled = 1
        if meta.has_field("disabled"):
            filters.disabled = 0
        return filters

    if cint(meta.issingle) == 1:
        return

    if cint(meta.istable) == 1:
        parent_doctypes = frappe.get_all(
            "DocField",
            fields="parent",
            filters={"fieldtype": ["in", frappe.model.table_fields], "options": doctype},
        )
        for p in parent_doctypes:
            rebuild_for_doctype(p.parent)
        return

    # Delete records
    delete_global_search_records_for_doctype(doctype)

    parent_search_fields = meta.get_global_search_fields()
    fieldnames = get_selected_fields(meta, parent_search_fields)

    # Get all records from parent doctype table
    all_records = frappe.get_all(doctype, fields=fieldnames, filters=_get_filters())

    # Children data
    all_children, child_search_fields = get_children_data(doctype, meta)
    all_contents = []
    varchar_len = int(frappe.db.VARCHAR_LEN)

    for doc in all_records:
        content = []
        for field in parent_search_fields:
            value = doc.get(field.fieldname)
            if value:
                content.append(get_formatted_value(value, field))

        # get children data
        for child_doctype, records in all_children.get(doc.name, {}).items():
            for field in child_search_fields.get(child_doctype):
                for r in records:
                    if r.get(field.fieldname):
                        content.append(get_formatted_value(r.get(field.fieldname), field))

        if not content:
            continue

        # if doctype published in website, push title, route etc.
        published = 0
        title, route = "", ""
        try:
            if hasattr(get_controller(doctype), "is_website_published") and meta.allow_guest_to_view:
                d = frappe.get_doc(doctype, doc.name)
                published = 1 if d.is_website_published() else 0
                title = d.get_title()
                route = d.get("route")
        except ImportError:
            # some doctypes have been deleted via a future patch, hence the
            # controller does not exist
            pass

        # Values are escaped here because insert_values_for_multiple_docs
        # interpolates them directly into the SQL text.
        all_contents.append(
            {
                "doctype": frappe.db.escape(doctype),
                "name": frappe.db.escape(doc.name),
                "content": frappe.db.escape(" ||| ".join(content)),
                "published": published,
                "title": frappe.db.escape((title or "")[:varchar_len]),
                "route": frappe.db.escape((route or "")[:varchar_len]),
            }
        )

    if all_contents:
        insert_values_for_multiple_docs(all_contents)


def delete_global_search_records_for_doctype(doctype):
    """
    Delete every __global_search row belonging to the given doctype
    :param doctype: Doctype
    """
    frappe.db.delete("__global_search", {"doctype": doctype})


def get_selected_fields(meta, global_search_fields):
    """
    Build the list of column names to fetch for a doctype's search content.

    :param meta: meta of the doctype being queried
    :param global_search_fields: docfields marked for global search
    :return: list of fieldnames, always including `parent` (child tables) or `name`
    """
    fieldnames = [df.fieldname for df in global_search_fields]
    if meta.istable == 1:
        fieldnames.append("parent")
    elif "name" not in fieldnames:
        fieldnames.append("name")

    if meta.has_field("is_website_published"):
        fieldnames.append("is_website_published")

    return fieldnames


def get_children_data(doctype, meta):
    """
    Get all records from all the child tables of a doctype

    all_children = {
            "parent1": {
                    "child_doctype1": [
                            {
                                    "field1": val1,
                                    "field2": val2
                            }
                    ]
            }
    }

    :param doctype: parent Doctype
    :param meta: meta of the parent doctype
    :return: tuple of (all_children, child_search_fields) where child_search_fields
            maps child doctype -> its global search docfields
    """
    all_children = frappe._dict()
    child_search_fields = frappe._dict()

    for child in meta.get_table_fields():
        child_meta = frappe.get_meta(child.options)
        search_fields = child_meta.get_global_search_fields()
        if not search_fields:
            continue

        child_search_fields.setdefault(child.options, search_fields)
        child_fieldnames = get_selected_fields(child_meta, search_fields)
        # NOTE(review): this filters out submitted rows (docstatus 1) while
        # rebuild_for_doctype filters out cancelled parents (docstatus 2) -
        # confirm which is intended before changing.
        child_records = frappe.get_all(
            child.options,
            fields=child_fieldnames,
            filters={"docstatus": ["!=", 1], "parenttype": doctype},
        )

        for record in child_records:
            all_children.setdefault(record.parent, frappe._dict()).setdefault(
                child.options, []
            ).append(record)

    return all_children, child_search_fields


def insert_values_for_multiple_docs(all_contents):
    """
    Bulk-insert rows into __global_search, ignoring duplicate (doctype, name) keys.

    :param all_contents: list of dicts with keys doctype, name, content, published,
            title, route. String values must already be SQL-escaped, because they are
            interpolated into the statement text.
    """
    values = [
        "({doctype}, {name}, {content}, {published}, {title}, {route})".format(**content)
        for content in all_contents
    ]

    mariadb_query = (
        "INSERT IGNORE INTO `__global_search` "
        "(doctype, name, content, published, title, route) "
        "VALUES {} "
    )
    postgres_query = (
        "INSERT INTO `__global_search` "
        "(doctype, name, content, published, title, route) "
        'VALUES {} ON CONFLICT("name", "doctype") DO NOTHING'
    )

    batch_size = 50000
    for i in range(0, len(values), batch_size):
        joined_values = ", ".join(values[i : i + batch_size])
        # ignoring duplicate keys for doctype_name
        frappe.db.multisql(
            {
                "mariadb": mariadb_query.format(joined_values),
                "postgres": postgres_query.format(joined_values),
            }
        )


def update_global_search(doc):
    """
    Add values marked with `in_global_search` to `global_search_queue` from given doc.

    Nothing is queued for cancelled documents, documents that are not enabled
    (when the doctype has an `enabled` field) or documents flagged `disabled`.

    :param doc: Document to be added to global search
    """
    if frappe.local.conf.get("disable_global_search"):
        return

    if (
        doc.docstatus > 1
        or (doc.meta.has_field("enabled") and not doc.get("enabled"))
        or doc.get("disabled")
    ):
        return

    content = []
    for field in doc.meta.get_global_search_fields():
        if doc.get(field.fieldname) and field.fieldtype not in frappe.model.table_fields:
            content.append(get_formatted_value(doc.get(field.fieldname), field))

    # Get children
    for child in doc.meta.get_table_fields():
        for d in doc.get(child.fieldname):
            if d.parent != doc.name:
                continue
            for field in d.meta.get_global_search_fields():
                if d.get(field.fieldname):
                    content.append(get_formatted_value(d.get(field.fieldname), field))

    if not content:
        return

    published = 0
    if hasattr(doc, "is_website_published") and doc.meta.allow_guest_to_view:
        published = 1 if doc.is_website_published() else 0

    title = (cstr(doc.get_title()) or "")[: int(frappe.db.VARCHAR_LEN)]
    route = doc.get("route") if doc else ""

    value = dict(
        doctype=doc.doctype,
        name=doc.name,
        content=" ||| ".join(content),
        published=published,
        title=title,
        route=route,
    )

    sync_value_in_queue(value)


def update_global_search_for_all_web_pages():
    """
    Index every static page found under the `www` folder of the installed apps
    and flush the queue into __global_search.
    """
    if frappe.conf.get("disable_global_search"):
        return

    print("Update global search for all web pages...")
    for route in get_routes_to_index():
        add_route_to_global_search(route)
    sync_global_search()


def get_routes_to_index():
    """
    Collect routes for all `.md` / `.html` files under each installed app's `www` folder.

    The route is the file path relative to `www` with everything after the first
    dot dropped; a trailing `index` is removed so that `docs/index.html` -> `docs/`.

    :return: list of route strings
    """
    routes_to_index = []

    for app in frappe.get_installed_apps():
        www_path = frappe.get_app_path(app, "www")

        for dirpath, _, filenames in os.walk(www_path, topdown=True):
            for f in filenames:
                if not f.endswith((".md", ".html")):
                    continue

                filepath = os.path.join(dirpath, f)
                route = os.path.relpath(filepath, www_path)
                route = route.split(".", 1)[0]

                if route.endswith("index"):
                    route = route.rsplit("index", 1)[0]

                routes_to_index.append(route)

    return routes_to_index


def add_route_to_global_search(route):
    """
    Render a route as Guest and queue its text for global search as a
    "Static Web Page" entry. Rendering failures are ignored (best effort).

    :param route: website route to render and index
    """
    from bs4 import BeautifulSoup

    from frappe.utils import set_request
    from frappe.website.serve import get_response_content

    frappe.set_user("Guest")
    frappe.local.no_cache = True

    try:
        set_request(method="GET", path=route)
        content = get_response_content(route)
        soup = BeautifulSoup(content, "html.parser")
        page_content = soup.find(class_="page_content")
        text_content = page_content.text if page_content else ""
        title = soup.title.text.strip() if soup.title else route

        value = dict(
            doctype="Static Web Page",
            name=route,
            content=text_content,
            published=1,
            title=title,
            route=route,
        )
        sync_value_in_queue(value)
    except Exception:
        # A page that fails to render is simply left out of the index.
        pass
    finally:
        # Always switch back, even on a non-Exception exit, so the session is
        # never left running as Guest.
        frappe.set_user("Administrator")


def get_formatted_value(value, field):
    """
    Prepare field from raw data.

    For Text / Text Editor fields the value is HTML-unescaped, script/style
    elements are removed and whitespace is collapsed. Remaining tags are
    stripped for every field type.

    :param value: raw field value
    :param field: docfield the value belongs to (its label prefixes the result)
    :return: string of the form "<label> : <plain text value>"
    """
    if getattr(field, "fieldtype", None) in ["Text", "Text Editor"]:
        value = unescape_html(frappe.safe_decode(value))
        value = HTML_TAGS_PATTERN.subn("", str(value))[0]
        value = " ".join(value.split())
    return field.label + " : " + strip_html_tags(str(value))


def sync_global_search():
    """
    Inserts / updates values from `global_search_queue` to __global_search.
    This is called via job scheduler
    :return:
    """
    while frappe.cache.llen("global_search_queue") > 0:
        # rpop to follow FIFO
        # Last one should override all previous contents of same document
        raw_value = frappe.cache.rpop("global_search_queue")
        if raw_value is None:
            # Queue was drained by someone else between llen and rpop.
            break
        sync_value(json.loads(raw_value.decode("utf-8")))


def sync_value_in_queue(value):
    """
    Push a search entry onto `global_search_queue`; if the queue is not
    reachable, write the entry to __global_search directly.

    :param value: dict of { doctype, name, content, published, title, route }
    """
    try:
        # append to search queue if connected
        frappe.cache.lpush("global_search_queue", json.dumps(value))
    except redis.exceptions.ConnectionError:
        # not connected, sync directly
        sync_value(value)


def sync_value(value):
    """
    Sync a given document to global search
    :param value: dict of { doctype, name, content, published, title, route }
    """
    insert_clause = (
        "INSERT INTO `__global_search` "
        "(`doctype`, `name`, `content`, `published`, `title`, `route`) "
        "VALUES (%(doctype)s, %(name)s, %(content)s, %(published)s, %(title)s, %(route)s) "
    )
    update_clause = (
        "`content`=%(content)s, `published`=%(published)s, "
        "`title`=%(title)s, `route`=%(route)s "
    )

    frappe.db.multisql(
        {
            "mariadb": insert_clause + "ON DUPLICATE key UPDATE " + update_clause,
            "postgres": insert_clause
            + 'ON CONFLICT("doctype", "name") DO UPDATE SET '
            + update_clause,
        },
        value,
    )


def delete_for_document(doc):
    """
    Delete the __global_search entry of a document that has been deleted
    :param doc: Deleted document
    """
    frappe.db.delete("__global_search", {"doctype": doc.doctype, "name": doc.name})


@frappe.whitelist()
def search(text, start=0, limit=20, doctype=""):
    """
    Search for given text in __global_search
    :param text: phrase to be searched; `&` separates independently searched words
    :param start: start results at, default 0
    :param limit: number of results to return, default 20
    :param doctype: restrict the search to this doctype; when empty, the doctypes
            configured in Global Search Settings are used
    :return: Array of result objects
    """
    from frappe.desk.doctype.global_search_settings.global_search_settings import (
        get_doctypes_for_global_search,
    )
    from frappe.query_builder.functions import Match

    # This endpoint is whitelisted, so `start` / `limit` may arrive as request
    # strings. Coerce them to integers before they reach LIMIT / OFFSET.
    start = cint(start)
    if limit is not None:
        limit = cint(limit)

    results = []
    sorted_results = []

    allowed_doctypes = get_doctypes_for_global_search()
    global_search = frappe.qb.Table("__global_search")

    for word in set(text.split("&")):
        word = word.strip()
        if not word:
            continue

        rank = Match(global_search.content).Against(word).as_("rank")
        query = (
            frappe.qb.from_(global_search)
            .select(global_search.doctype, global_search.name, global_search.content, rank)
            .orderby("rank", order=frappe.qb.desc)
            .limit(limit)
        )

        if doctype:
            query = query.where(global_search.doctype == doctype)
        elif allowed_doctypes:
            query = query.where(global_search.doctype.isin(allowed_doctypes))

        if start > 0:
            query = query.offset(start)

        results.extend(query.run(as_dict=True))

    # sort results based on allowed_doctype's priority
    for allowed_doctype in allowed_doctypes:
        for r in results:
            if r.doctype != allowed_doctype or not r.rank > 0.0:
                continue

            try:
                meta = frappe.get_meta(r.doctype)
                if meta.image_field:
                    r.image = frappe.db.get_value(r.doctype, r.name, meta.image_field)
            except Exception:
                # The image is optional; drop any messages raised while fetching it.
                frappe.clear_messages()

            sorted_results.append(r)

    return sorted_results


@frappe.whitelist(allow_guest=True)
def web_search(text: str, scope: str | None = None, start: int = 0, limit: int = 20):
    """
    Search for given text in __global_search where published = 1
    :param text: phrase to be searched; `&` separates independently searched phrases
    :param scope: search only in this route, for e.g /docs
    :param start: start results at, default 0
    :param limit: number of results to return, default 20
    :return: Array of result objects
    """
    results = []
    phrases = text.split("&")

    # Everything except the full-text condition is the same for every phrase.
    common_query = (
        " SELECT `doctype`, `name`, `content`, `title`, `route` "
        "FROM `__global_search` "
        "WHERE {conditions} "
        "LIMIT %(limit)s OFFSET %(start)s"
    )
    scope_condition = "`route` like %(scope)s AND " if scope else ""
    published_condition = "`published` = 1 AND "
    base_conditions = " ".join([published_condition, scope_condition])
    values = {"scope": "".join([scope, "%"]) if scope else "", "limit": limit, "start": start}

    for phrase in phrases:
        # https://mariadb.com/kb/en/library/full-text-index-overview/#in-boolean-mode
        mariadb_conditions = base_conditions + "MATCH(`content`) AGAINST ({} IN BOOLEAN MODE)".format(
            frappe.db.escape("+" + phrase + "*")
        )
        postgres_conditions = base_conditions + 'TO_TSVECTOR("content") @@ PLAINTO_TSQUERY({})'.format(
            frappe.db.escape(phrase)
        )

        result = frappe.db.multisql(
            {
                "mariadb": common_query.format(conditions=mariadb_conditions),
                "postgres": common_query.format(conditions=postgres_conditions),
            },
            values=values,
            as_dict=True,
        )

        # NOTE(review): once `results` is non-empty, only rows that are already
        # present are appended again. Kept exactly as before - confirm intent.
        results += [row for row in result if not results or row in results]

    # chart of accounts -> {chart, of, accounts}
    # titles that match the most of these words will have high relevance
    # (as before, only the last `&`-separated phrase is used for ranking)
    words = set(get_distinct_words(phrases[-1]))
    for r in results:
        title_words = set(get_distinct_words(r.title))
        r.relevance = len(words.intersection(title_words))

    return sorted(results, key=lambda x: x.relevance, reverse=True)


def get_distinct_words(text):
    """
    Split text into lower-cased words with single and double quotes removed.

    Splitting is on any run of whitespace, so repeated or trailing spaces do not
    produce empty "words". A missing value (None) yields an empty list.

    :param text: text to split
    :return: list of lower-cased words
    """
    text = (text or "").replace('"', "").replace("'", "")
    return [w.lower() for w in text.split()]