diff --git a/frappe/commands/site.py b/frappe/commands/site.py index 26eb455338..b72d98c433 100755 --- a/frappe/commands/site.py +++ b/frappe/commands/site.py @@ -274,8 +274,9 @@ def disable_user(context, email): @click.command('migrate') @click.option('--rebuild-website', help="Rebuild webpages after migration") @click.option('--skip-failing', is_flag=True, help="Skip patches that fail to run") +@click.option('--skip-search-index', is_flag=True, help="Skip search indexing for web documents") @pass_context -def migrate(context, rebuild_website=False, skip_failing=False): +def migrate(context, rebuild_website=False, skip_failing=False, skip_search_index=False): "Run patches, sync schema and rebuild files/translations" from frappe.migrate import migrate @@ -284,7 +285,12 @@ def migrate(context, rebuild_website=False, skip_failing=False): frappe.init(site=site) frappe.connect() try: - migrate(context.verbose, rebuild_website=rebuild_website, skip_failing=skip_failing) + migrate( + context.verbose, + rebuild_website=rebuild_website, + skip_failing=skip_failing, + skip_search_index=skip_search_index + ) finally: frappe.destroy() if not context.sites: @@ -655,6 +661,22 @@ def start_ngrok(context): frappe.destroy() ngrok.kill() +@click.command('build-search-index') +@pass_context +def build_search_index(context): + from frappe.search.website_search import build_index_for_all_routes + site = get_site(context) + if not site: + raise SiteNotSpecifiedError + + print('Building search index for {}'.format(site)) + frappe.init(site=site) + frappe.connect() + try: + build_index_for_all_routes() + finally: + frappe.destroy() + commands = [ add_system_manager, backup, @@ -680,5 +702,6 @@ commands = [ start_recording, stop_recording, add_to_hosts, - start_ngrok + start_ngrok, + build_search_index ] diff --git a/frappe/core/doctype/doctype/doctype.json b/frappe/core/doctype/doctype/doctype.json index 379ea227cb..698289140e 100644 --- a/frappe/core/doctype/doctype/doctype.json +++ b/frappe/core/doctype/doctype/doctype.json @@ -70,6 +70,7 @@ "web_view", "has_web_view", "allow_guest_to_view", + "index_web_pages_for_search", "route", "is_published_field", "advanced", @@ -517,12 +518,18 @@ "fieldname": "email_settings_sb", "fieldtype": "Section Break", "label": "Email Settings" + }, + { + "default": "1", + "fieldname": "index_web_pages_for_search", + "fieldtype": "Check", + "label": "Index Web Pages for Search" } ], "icon": "fa fa-bolt", "idx": 6, "links": [], - "modified": "2020-03-27 14:51:44.581128", + "modified": "2020-07-21 16:20:57.028802", "modified_by": "Administrator", "module": "Core", "name": "DocType", diff --git a/frappe/email/doctype/newsletter/newsletter.json b/frappe/email/doctype/newsletter/newsletter.json index 01f75be954..1ec64826da 100644 --- a/frappe/email/doctype/newsletter/newsletter.json +++ b/frappe/email/doctype/newsletter/newsletter.json @@ -132,10 +132,11 @@ "has_web_view": 1, "icon": "fa fa-envelope", "idx": 1, + "index_web_pages_for_search": 1, "is_published_field": "published", "links": [], "max_attachments": 3, - "modified": "2020-05-12 18:09:40.137138", + "modified": "2020-07-21 16:25:17.687476", "modified_by": "Administrator", "module": "Email", "name": "Newsletter", diff --git a/frappe/hooks.py b/frappe/hooks.py index 1f209f00a2..894e72a121 100644 --- a/frappe/hooks.py +++ b/frappe/hooks.py @@ -272,9 +272,6 @@ setup_wizard_exception = [ ] before_migrate = ['frappe.patches.v11_0.sync_user_permission_doctype_before_migrate.execute'] -after_migrate = [ - 'frappe.modules.full_text_search.build_index_for_all_routes' -] otp_methods = ['OTP App','Email','SMS'] user_privacy_documents = [ diff --git a/frappe/migrate.py b/frappe/migrate.py index 9ec23d8ae7..6d64799fdd 100644 --- a/frappe/migrate.py +++ b/frappe/migrate.py @@ -19,10 +19,10 @@ from frappe.website import render from frappe.core.doctype.language.language import sync_languages from frappe.modules.utils import sync_customizations from frappe.core.doctype.scheduled_job_type.scheduled_job_type import sync_jobs -from frappe.utils import global_search +from frappe.search.website_search import build_index_for_all_routes -def migrate(verbose=True, rebuild_website=False, skip_failing=False): +def migrate(verbose=True, rebuild_website=False, skip_failing=False, skip_search_index=False): '''Migrate all apps to the latest version, will: - run before migrate hooks - run patches @@ -80,9 +80,6 @@ Otherwise, check the server logs and ensure that all the required services are r # syncs statics render.clear_cache() - # add static pages to global search - global_search.update_global_search_for_all_web_pages() - # updating installed applications data frappe.get_single('Installed Applications').update_versions() @@ -91,6 +88,12 @@ Otherwise, check the server logs and ensure that all the required services are r for fn in frappe.get_hooks('after_migrate', app_name=app): frappe.get_attr(fn)() + # build web_routes index + if not skip_search_index: + # Run this last as it updates the current session + print('Building search index for {}'.format(frappe.local.site)) + build_index_for_all_routes() + frappe.db.commit() clear_notifications() diff --git a/frappe/modules/full_text_search.py b/frappe/modules/full_text_search.py deleted file mode 100644 index fce9983907..0000000000 --- a/frappe/modules/full_text_search.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020, Frappe Technologies Pvt. Ltd. and Contributors -# MIT License. See license.txt - -from __future__ import unicode_literals -import frappe -from whoosh.index import create_in, open_dir -from whoosh.fields import TEXT, ID, Schema -from whoosh.qparser import MultifieldParser, FieldsPlugin, WildcardPlugin -from whoosh.query import Prefix -from bs4 import BeautifulSoup -from frappe.website.render import render_page -from frappe.utils import set_request, cint -from frappe.utils.global_search import get_routes_to_index - - -def build_index_for_all_routes(): - print("Building search index for all web routes...") - routes = get_routes_to_index() - documents = [get_document_to_index(route) for route in routes] - build_index("web_routes", documents) - - -@frappe.whitelist(allow_guest=True) -def web_search(index_name, query, scope=None, limit=20): - limit = cint(limit) - return search(index_name, query, scope, limit) - - -def get_document_to_index(route): - frappe.set_user("Guest") - frappe.local.no_cache = True - - try: - set_request(method="GET", path=route) - content = render_page(route) - soup = BeautifulSoup(content, "html.parser") - page_content = soup.find(class_="page_content") - text_content = page_content.text if page_content else "" - title = soup.title.text.strip() if soup.title else route - - frappe.set_user("Administrator") - - return frappe._dict(title=title, content=text_content, path=route) - except ( - frappe.PermissionError, - frappe.DoesNotExistError, - frappe.ValidationError, - Exception, - ): - pass - - -def build_index(index_name, documents): - schema = Schema( - title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True) - ) - - index_dir = get_index_path(index_name) - frappe.create_folder(index_dir) - - ix = create_in(index_dir, schema) - writer = ix.writer() - - for document in documents: - if document: - writer.add_document( - title=document.title, path=document.path, content=document.content - ) - - writer.commit() - - -def search(index_name, text, scope=None, limit=20): - index_dir = get_index_path(index_name) - ix = open_dir(index_dir) - - results = None - out = [] - with ix.searcher() as searcher: - parser = MultifieldParser(["title", "content"], ix.schema) - parser.remove_plugin_class(FieldsPlugin) - parser.remove_plugin_class(WildcardPlugin) - query = parser.parse(text) - - filter_scoped = None - if scope: - filter_scoped = Prefix("path", scope) - results = searcher.search(query, limit=limit, filter=filter_scoped) - - for r in results: - title_highlights = r.highlights("title") - content_highlights = r.highlights("content") - out.append( - frappe._dict( - title=r["title"], - path=r["path"], - title_highlights=title_highlights, - content_highlights=content_highlights, - ) - ) - - return out - - -def get_index_path(index_name): - return frappe.get_site_path("indexes", index_name) diff --git a/frappe/public/scss/doc.scss b/frappe/public/scss/doc.scss index 13a59ba45b..f258e2ee47 100644 --- a/frappe/public/scss/doc.scss +++ b/frappe/public/scss/doc.scss @@ -81,45 +81,10 @@ $navbar-height-lg: 4.5rem; } .doc-search { - position: relative; - width: 100%; - @include media-breakpoint-up(lg) { padding-left: 4rem; padding-right: 4rem; } - - .search-icon { - position: absolute; - left: 0; - top: 0; - width: 2.5rem; - height: 100%; - display: flex; - justify-content: center; - align-items: center; - } - - svg { - color: $gray-600; - } - - input { - padding-left: 2.5rem; - } - - .dropdown-menu { - .dropdown-item { - padding: 1rem 0.75rem; - } - - .match { - background-color: $primary-light; - color: $primary; - font-weight: 500; - padding: 0 0.125rem; - } - } } .doc-sidebar { diff --git a/frappe/public/scss/search.scss b/frappe/public/scss/search.scss new file mode 100644 index 0000000000..aee2457f31 --- /dev/null +++ b/frappe/public/scss/search.scss @@ -0,0 +1,40 @@ +.website-search { + position: relative; + width: 100%; + + .search-icon { + position: absolute; + left: 0; + top: 0; + width: 2.5rem; + height: 100%; + display: flex; + justify-content: center; + align-items: center; + } + + svg { + color: $gray-600; + } + + input { + padding-left: 2.5rem; + } + + .dropdown-menu { + .dropdown-item { + padding: 1rem 0.75rem; + + &:focus { + background-color: $gray-100; + } + } + + .match { + background-color: $primary-light; + color: $primary; + font-weight: 500; + padding: 0 0.125rem; + } + } +} \ No newline at end of file diff --git a/frappe/public/scss/website.scss b/frappe/public/scss/website.scss index 3c11d23252..e64c090ea8 100644 --- a/frappe/public/scss/website.scss +++ b/frappe/public/scss/website.scss @@ -10,6 +10,7 @@ @import 'markdown'; @import 'sidebar'; @import 'portal'; +@import 'search'; @import 'doc'; .ql-editor.read-mode { diff --git a/frappe/search/__init__.py b/frappe/search/__init__.py new file mode 100644 index 0000000000..0436775417 --- /dev/null +++ b/frappe/search/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020, Frappe Technologies Pvt. Ltd. and Contributors +# MIT License. See license.txt + +import frappe +from frappe.utils import cint +from frappe.search.website_search import WebsiteSearch +from frappe.search.full_text_search import FullTextSearch + +@frappe.whitelist(allow_guest=True) +def web_search(query, scope=None, limit=20): + limit = cint(limit) + ws = WebsiteSearch(index_name="web_routes") + return ws.search(query, scope, limit) \ No newline at end of file diff --git a/frappe/search/full_text_search.py b/frappe/search/full_text_search.py new file mode 100644 index 0000000000..dd6e69111d --- /dev/null +++ b/frappe/search/full_text_search.py @@ -0,0 +1,136 @@ +# Copyright (c) 2020, Frappe Technologies Pvt. Ltd. and Contributors +# MIT License. See license.txt + +from __future__ import unicode_literals +import frappe + +from whoosh.index import create_in, open_dir, EmptyIndexError +from whoosh.fields import TEXT, ID, Schema +from whoosh.qparser import MultifieldParser, FieldsPlugin, WildcardPlugin +from whoosh.query import Prefix + +class FullTextSearch: + """ Frappe Wrapper for Whoosh """ + + def __init__(self, index_name): + self.index_name = index_name + self.index_path = get_index_path(index_name) + self.schema = self.get_schema() + self.id = self.get_id() + + def get_schema(self): + return Schema(name=ID(stored=True), content=TEXT(stored=True)) + + def get_id(self): + return "name" + + def get_items_to_index(self): + """Get all documents to be indexed conforming to the schema""" + return [] + + def get_document_to_index(self): + return {} + + def build(self): + """ Build search index for all documents """ + self.documents = self.get_items_to_index() + self.build_index() + + def update_index_by_name(self, doc_name): + """Wraps `update_index` method, gets the document from name + and updates the index. This function changes the current user + and should only be run as administrator or in a background job. + + Args: + self (object): FullTextSearch Instance + doc_name (str): name of the document to be updated + """ + document = self.get_document_to_index(doc_name) + self.update_index(document) + + def remove_document_from_index(self, doc_name): + """Remove document from search index + + Args: + self (object): FullTextSearch Instance + doc_name (str): name of the document to be removed + """ + if not doc_name: + return + + ix = self.get_index() + with ix.searcher(): + writer = ix.writer() + writer.delete_by_term(self.id, doc_name) + writer.commit(optimize=True) + + def update_index(self, document): + """Update search index for a document + + Args: + self (object): FullTextSearch Instance + document (_dict): A dictionary with title, path and content + """ + ix = self.get_index() + + with ix.searcher(): + writer = ix.writer() + writer.delete_by_term(self.id, document[self.id]) + writer.add_document(**document) + writer.commit(optimize=True) + + def get_index(self): + try: + return open_dir(self.index_path) + except EmptyIndexError: + return self.create_index() + + def create_index(self): + frappe.create_folder(self.index_path) + return create_in(self.index_path, self.schema) + + def build_index(self): + """Build index for all parsed documents""" + ix = self.create_index() + writer = ix.writer() + + for document in self.documents: + if document: + writer.add_document(**document) + + writer.commit(optimize=True) + + def search(self, text, scope=None, limit=20): + """Search from the current index + + Args: + text (str): String to search for + scope (str, optional): Scope to limit the search. Defaults to None. + limit (int, optional): Limit number of search results. Defaults to 20. + + Returns: + [List(_dict)]: Search results + """ + ix = self.get_index() + + results = None + out = [] + + with ix.searcher() as searcher: + parser = MultifieldParser(["title", "content"], ix.schema) + parser.remove_plugin_class(FieldsPlugin) + parser.remove_plugin_class(WildcardPlugin) + query = parser.parse(text) + + filter_scoped = None + if scope: + filter_scoped = Prefix(self.id, scope) + results = searcher.search(query, limit=limit, filter=filter_scoped) + + for r in results: + out.append(self.parse_result(r)) + + return out + +def get_index_path(index_name): + return frappe.get_site_path("indexes", index_name) \ No newline at end of file diff --git a/frappe/search/test_full_text_search.py b/frappe/search/test_full_text_search.py new file mode 100644 index 0000000000..be9669d3c5 --- /dev/null +++ b/frappe/search/test_full_text_search.py @@ -0,0 +1,128 @@ +# Copyright (c) 2020, Frappe Technologies Pvt. Ltd. and Contributors +# MIT License. See license.txt +import unittest +from frappe.search.full_text_search import FullTextSearch + +class TestFullTextSearch(unittest.TestCase): + + def setUp(self): + index = get_index() + index.build() + self.index = index + + def test_search_term(self): + # Search Wikipedia + res = self.index.search("multilingual online encyclopedia") + self.assertEqual(res[0], 'site/wikipedia') + + res = self.index.search("Linux kernel") + self.assertEqual(res[0], 'os/linux') + + res = self.index.search("Enterprise Resource Planning") + self.assertEqual(res[0], 'sw/erpnext') + + def test_search_limit(self): + res = self.index.search("CommonSearchTerm") + self.assertEqual(len(res), 5) + + res = self.index.search("CommonSearchTerm", limit=3) + self.assertEqual(len(res), 3) + + res = self.index.search("CommonSearchTerm", limit=20) + self.assertEqual(len(res), 5) + + def test_search_scope(self): + # Search outside scope + res = self.index.search("multilingual online encyclopedia", scope=["os"]) + self.assertEqual(len(res), 0) + + # Search inside scope + res = self.index.search("CommonSearchTerm", scope=["os"]) + self.assertEqual(len(res), 2) + self.assertTrue('os/linux' in res) + self.assertTrue('os/gnu' in res) + + def test_remove_document_from_index(self): + self.index.remove_document_from_index("os/gnu") + res = self.index.search("GNU") + self.assertEqual(len(res), 0) + + def test_update_index(self): + # Update existing index + self.index.update_index({ + 'name': "sw/erpnext", + 'content': """AwesomeERPNext""" + }) + + res = self.index.search("CommonSearchTerm") + self.assertTrue('sw/erpnext' not in res) + + res = self.index.search("AwesomeERPNext") + self.assertEqual(res[0], "sw/erpnext") + + # Update new doc + self.index.update_index({ + 'name': "sw/frappebooks", + 'content': """DesktopAccounting""" + }) + + res = self.index.search("DesktopAccounting") + self.assertEqual(res[0], "sw/frappebooks") + + + +class TestWrapper(FullTextSearch): + def get_items_to_index(self): + return get_documents() + + def get_document_to_index(self, name): + documents = get_documents() + for doc in documents: + if doc["name"] == name: + return doc + + def parse_result(self, result): + return result["name"] + + +def get_index(): + return TestWrapper("test_frappe_index") + +def get_documents(): + docs = [] + docs.append({ + 'name': "site/wikipedia", + 'content': """Wikipedia is a multilingual online encyclopedia created and maintained + as an open collaboration project by a community of volunteer editors using a wiki-based editing system. + It is the largest and most popular general reference work on the World Wide Web. CommonSearchTerm""" + }) + + docs.append({ + 'name': "os/linux", + 'content': """Linux is a family of open source Unix-like operating systems based on the + Linux kernel, an operating system kernel first released on September 17, 1991, by Linus Torvalds. + Linux is typically packaged in a Linux distribution. CommonSearchTerm""" + }) + + docs.append({ + 'name': "os/gnu", + 'content': """GNU is an operating system and an extensive collection of computer software. + GNU is composed wholly of free software, most of which is licensed under the GNU Project's own + General Public License. GNU is a recursive acronym for "GNU's Not Unix! ", + chosen because GNU's design is Unix-like, but differs from Unix by being free software and containing no Unix code. CommonSearchTerm""" + }) + + docs.append({ + 'name': "sw/erpnext", + 'content': """ERPNext is a free and open-source integrated Enterprise Resource Planning software developed by + Frappe Technologies Pvt. Ltd. and is built on MariaDB database system using a Python based server-side framework. + ERPNext is a generic ERP software used by manufacturers, distributors and services companies. CommonSearchTerm""" + }) + + docs.append({ + 'name': "sw/frappe", + 'content': """Frappe Framework is a full-stack web framework, that includes everything you need to build and + deploy business applications with Rich Admin Interface. CommonSearchTerm""" + }) + + return docs \ No newline at end of file diff --git a/frappe/search/website_search.py b/frappe/search/website_search.py new file mode 100644 index 0000000000..de93fea3f5 --- /dev/null +++ b/frappe/search/website_search.py @@ -0,0 +1,117 @@ +# Copyright (c) 2020, Frappe Technologies Pvt. Ltd. and Contributors +# MIT License. See license.txt + +from __future__ import unicode_literals +import frappe +from bs4 import BeautifulSoup +from whoosh.fields import TEXT, ID, Schema +from frappe.search.full_text_search import FullTextSearch +from frappe.website.render import render_page +from frappe.utils import set_request +import os + +INDEX_NAME = "web_routes" + +class WebsiteSearch(FullTextSearch): + """ Wrapper for WebsiteSearch """ + + def get_schema(self): + return Schema( + title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True) + ) + + def get_id(self): + return "path" + + def get_items_to_index(self): + """Get all routes to be indexed, this includes the static pages + in www/ and routes from published documents + + Returns: + self (object): FullTextSearch Instance + """ + routes = get_static_pages_from_all_apps() + routes += get_doctype_routes_with_web_view() + + documents = [self.get_document_to_index(route) for route in routes] + return documents + + def get_document_to_index(self, route): + """Render a page and parse it using BeautifulSoup + + Args: + path (str): route of the page to be parsed + + Returns: + document (_dict): A dictionary with title, path and content + """ + frappe.set_user("Guest") + frappe.local.no_cache = True + + try: + set_request(method="GET", path=route) + content = render_page(route) + soup = BeautifulSoup(content, "html.parser") + page_content = soup.find(class_="page_content") + text_content = page_content.text if page_content else "" + title = soup.title.text.strip() if soup.title else route + + return frappe._dict(title=title, content=text_content, path=route) + except Exception: + pass + finally: + frappe.set_user("Administrator") + + def parse_result(self, result): + title_highlights = result.highlights("title") + content_highlights = result.highlights("content") + + return frappe._dict( + title=result["title"], + path=result["path"], + title_highlights=title_highlights, + content_highlights=content_highlights, + ) + + +def get_doctype_routes_with_web_view(): + all_routes = [] + filters = { "has_web_view": 1, "allow_guest_to_view": 1, "index_web_pages_for_search": 1} + fields = ["name", "is_published_field"] + doctype_with_web_views = frappe.get_all("DocType", filters=filters, fields=fields) + + for doctype in doctype_with_web_views: + if doctype.is_published_field: + routes = frappe.get_all(doctype.name, filters={doctype.is_published_field: 1}, fields="route") + all_routes += [route.route for route in routes] + + return all_routes + +def get_static_pages_from_all_apps(): + from glob import glob + apps = frappe.get_installed_apps() + + routes_to_index = [] + for app in apps: + path_to_index = frappe.get_app_path(app, 'www') + + files_to_index = glob(path_to_index + '/**/*.html', recursive=True) + files_to_index.extend(glob(path_to_index + '/**/*.md', recursive=True)) + for file in files_to_index: + route = os.path.relpath(file, path_to_index).split('.')[0] + if route.endswith('index'): + route = route.rsplit('index', 1)[0] + routes_to_index.append(route) + return routes_to_index + +def update_index_for_path(path): + ws = WebsiteSearch(INDEX_NAME) + return ws.update_index_by_name(path) + +def remove_document_from_index(path): + ws = WebsiteSearch(INDEX_NAME) + return ws.remove_document_from_index(path) + +def build_index_for_all_routes(): + ws = WebsiteSearch(INDEX_NAME) + return ws.build() \ No newline at end of file diff --git a/frappe/templates/doc.html b/frappe/templates/doc.html index 3e1cc5509a..3a566a1227 100644 --- a/frappe/templates/doc.html +++ b/frappe/templates/doc.html @@ -22,7 +22,7 @@