seitime-frappe/frappe/utils/global_search.py
Ben Knowles ecfc0714bc
fix: global search index breaking on long titles
If you created a doc with a title longer than 140 characters, it's supposed to be truncated to 140.

However, the global search index would break for that doctype due to an error in the query.

The truncate was happening AFTER it was escaped, causing it to lose the closing quote mark which caused a SQL error and broke the index.
2020-01-02 14:33:57 -06:00

533 lines
15 KiB
Python

# Copyright (c) 2015, Frappe Technologies Pvt. Ltd. and Contributors
# License: GNU General Public License v3. See license.txt
from __future__ import unicode_literals
import frappe
import re
import redis
import json
import os
from bs4 import BeautifulSoup
from frappe.utils import cint, strip_html_tags
from frappe.model.base_document import get_controller
from six import text_type
def setup_global_search_table():
	"""Create the `__global_search` table (no-op if it already exists)."""
	frappe.db.create_global_search_table()
def reset():
	"""Wipe every row from `__global_search` so the index can be rebuilt."""
	frappe.db.sql('DELETE FROM `__global_search`')
def get_doctypes_with_global_search(with_child_tables=True):
	"""
	Return names of doctypes that define at least one global search field.
	The result is cached under the key 'doctypes_with_global_search'.
	:param with_child_tables: include child-table doctypes when True
	:return: list of doctype names belonging to installed apps
	"""
	def _get():
		filters = {} if with_child_tables else {"istable": ["!=", 1], "issingle": ["!=", 1]}

		# doctypes that actually declare global search fields
		searchable = [
			d for d in frappe.get_all('DocType', fields=['name', 'module'], filters=filters)
			if len(frappe.get_meta(d.name).get_global_search_fields()) > 0
		]

		installed_apps = frappe.get_installed_apps()
		module_app = frappe.local.module_app

		# keep only doctypes whose module belongs to an installed app
		return [
			d.name for d in searchable
			if module_app.get(frappe.scrub(d.module))
			and module_app[frappe.scrub(d.module)] in installed_apps
		]

	return frappe.cache().get_value('doctypes_with_global_search', _get)
def rebuild_for_doctype(doctype):
	"""
	Rebuild entries of doctype's documents in __global_search on change of
	searchable fields.
	:param doctype: Doctype name to rebuild the index for
	"""
	# (fixed: this guard was duplicated twice in a row)
	if frappe.local.conf.get('disable_global_search'):
		return

	def _get_filters():
		# exclude cancelled docs and docs explicitly disabled/unpublished
		filters = frappe._dict({ "docstatus": ["!=", 2] })
		if meta.has_field("enabled"):
			filters.enabled = 1
		if meta.has_field("disabled"):
			filters.disabled = 0
		return filters

	meta = frappe.get_meta(doctype)
	if cint(meta.istable) == 1:
		# child table: rebuild every parent doctype that embeds it instead
		parent_doctypes = frappe.get_all("DocField", fields="parent", filters={
			"fieldtype": ["in", frappe.model.table_fields],
			"options": doctype
		})
		for p in parent_doctypes:
			rebuild_for_doctype(p.parent)
		return

	# Delete records
	delete_global_search_records_for_doctype(doctype)

	parent_search_fields = meta.get_global_search_fields()
	fieldnames = get_selected_fields(meta, parent_search_fields)

	# Get all records from parent doctype table
	all_records = frappe.get_all(doctype, fields=fieldnames, filters=_get_filters())

	# Children data
	all_children, child_search_fields = get_children_data(doctype, meta)
	all_contents = []

	for doc in all_records:
		content = []
		for field in parent_search_fields:
			value = doc.get(field.fieldname)
			if value:
				content.append(get_formatted_value(value, field))

		# get children data
		for child_doctype, records in all_children.get(doc.name, {}).items():
			for field in child_search_fields.get(child_doctype):
				for r in records:
					if r.get(field.fieldname):
						content.append(get_formatted_value(r.get(field.fieldname), field))

		if content:
			# if doctype published in website, push title, route etc.
			published = 0
			title, route = "", ""
			try:
				if hasattr(get_controller(doctype), "is_website_published") and meta.allow_guest_to_view:
					d = frappe.get_doc(doctype, doc.name)
					published = 1 if d.is_website_published() else 0
					title = d.get_title()
					route = d.get("route")
			except ImportError:
				# some doctypes have been deleted via a later patch, so the
				# controller module no longer exists
				pass

			# NOTE: truncate title/route to VARCHAR_LEN *before* escaping —
			# truncating after escape can cut off the closing quote and break
			# the generated INSERT statement
			all_contents.append({
				"doctype": frappe.db.escape(doctype),
				"name": frappe.db.escape(doc.name),
				"content": frappe.db.escape(' ||| '.join(content or '')),
				"published": published,
				"title": frappe.db.escape((title or '')[:int(frappe.db.VARCHAR_LEN)]),
				"route": frappe.db.escape((route or '')[:int(frappe.db.VARCHAR_LEN)])
			})
	if all_contents:
		insert_values_for_multiple_docs(all_contents)
def delete_global_search_records_for_doctype(doctype):
	"""Remove every `__global_search` row belonging to `doctype`."""
	query = '''DELETE
		FROM `__global_search`
		WHERE doctype = %s'''
	frappe.db.sql(query, doctype, as_dict=True)
def get_selected_fields(meta, global_search_fields):
	"""
	Build the list of columns to fetch when indexing a doctype.
	Child tables additionally need `parent`; standalone doctypes need `name`;
	website-enabled doctypes need `is_website_published`.
	:param meta: doctype meta object
	:param global_search_fields: docfields marked for global search
	:return: list of fieldnames to select
	"""
	selected = [df.fieldname for df in global_search_fields]

	if meta.istable == 1:
		selected.append("parent")
	elif "name" not in selected:
		selected.append("name")

	if meta.has_field("is_website_published"):
		selected.append("is_website_published")

	return selected
def get_children_data(doctype, meta):
	"""
	Get all records from all the child tables of a doctype

	all_children = {
		"parent1": {
			"child_doctype1": [
				{
					"field1": val1,
					"field2": val2
				}
			]
		}
	}

	:param doctype: parent doctype name
	:param meta: parent doctype meta
	:return: (all_children dict as above, child_search_fields dict mapping
		child doctype -> its global search fields)
	"""
	all_children = frappe._dict()
	child_search_fields = frappe._dict()

	for child in meta.get_table_fields():
		child_meta = frappe.get_meta(child.options)
		search_fields = child_meta.get_global_search_fields()
		if search_fields:
			child_search_fields.setdefault(child.options, search_fields)
			child_fieldnames = get_selected_fields(child_meta, search_fields)
			# NOTE(review): parent records are filtered with docstatus != 2,
			# but children here use docstatus != 1 (which excludes submitted
			# rows) — looks inconsistent; confirm intent before changing
			child_records = frappe.get_all(child.options, fields=child_fieldnames, filters={
				"docstatus": ["!=", 1],
				"parenttype": doctype
			})
			# group each child row under its parent name, then child doctype
			for record in child_records:
				all_children.setdefault(record.parent, frappe._dict())\
					.setdefault(child.options, []).append(record)

	return all_children, child_search_fields
def insert_values_for_multiple_docs(all_contents):
	"""
	Bulk-insert pre-escaped rows into `__global_search` in batches of 50k.
	Rows that collide on the (name, doctype) key are silently skipped.
	:param all_contents: list of dicts whose values are already SQL-escaped
	"""
	rows = [
		"({doctype}, {name}, {content}, {published}, {title}, {route})".format(**content)
		for content in all_contents
	]

	batch_size = 50000
	for start in range(0, len(rows), batch_size):
		joined = ", ".join(rows[start:start + batch_size])
		# ignoring duplicate keys for doctype_name
		frappe.db.multisql({
			'mariadb': '''INSERT IGNORE INTO `__global_search`
				(doctype, name, content, published, title, route)
				VALUES {0} '''.format(joined),
			'postgres': '''INSERT INTO `__global_search`
				(doctype, name, content, published, title, route)
				VALUES {0}
				ON CONFLICT("name", "doctype") DO NOTHING'''.format(joined)
		})
def update_global_search(doc):
	"""
	Add values marked with `in_global_search` to
	`global_search_queue` from given doc.
	:param doc: Document to be added to global search
	"""
	# (fixed: this guard was duplicated twice in a row)
	if frappe.local.conf.get('disable_global_search'):
		return

	# skip cancelled, disabled, or not-enabled documents
	if doc.docstatus > 1 or (doc.meta.has_field("enabled") and not doc.get("enabled")) \
		or doc.get("disabled"):
		return

	content = []
	for field in doc.meta.get_global_search_fields():
		if doc.get(field.fieldname) and field.fieldtype not in frappe.model.table_fields:
			content.append(get_formatted_value(doc.get(field.fieldname), field))

	# Get children
	for child in doc.meta.get_table_fields():
		for d in doc.get(child.fieldname):
			if d.parent == doc.name:
				for field in d.meta.get_global_search_fields():
					if d.get(field.fieldname):
						content.append(get_formatted_value(d.get(field.fieldname), field))

	if content:
		published = 0
		if hasattr(doc, 'is_website_published') and doc.meta.allow_guest_to_view:
			published = 1 if doc.is_website_published() else 0

		# truncate before queueing so the stored title fits the column
		title = (doc.get_title() or '')[:int(frappe.db.VARCHAR_LEN)]
		route = doc.get('route') if doc else ''

		value = dict(
			doctype=doc.doctype,
			name=doc.name,
			content=' ||| '.join(content or ''),
			published=published,
			title=title,
			route=route
		)
		sync_value_in_queue(value)
def update_global_search_for_all_web_pages():
	"""Queue every static web page route for indexing, then drain the queue."""
	for route in get_routes_to_index():
		add_route_to_global_search(route)
	sync_global_search()
def get_routes_to_index():
	"""
	Collect website routes from every installed app's `www` folder.
	:return: list of routes — `.md`/`.html` file paths relative to `www`,
		with the extension dropped and a trailing `index` stripped
	"""
	routes_to_index = []
	for app in frappe.get_installed_apps():
		# (fixed: `base` and `path_to_index` were two identical variables)
		base = frappe.get_app_path(app, 'www')
		for dirpath, _, filenames in os.walk(base, topdown=True):
			for f in filenames:
				if f.endswith(('.md', '.html')):
					filepath = os.path.join(dirpath, f)

					route = os.path.relpath(filepath, base)
					# NOTE(review): splits on the FIRST dot anywhere in the
					# relative path, so a dotted directory name would also be
					# truncated here — preserved as-is
					route = route.split('.')[0]

					if route.endswith('index'):
						route = route.rsplit('index', 1)[0]

					routes_to_index.append(route)

	return routes_to_index
def add_route_to_global_search(route):
	"""
	Render `route` as Guest and queue its text content for indexing as a
	'Static Web Page' entry. Best effort: pages that fail to render are
	silently skipped.
	:param route: website route to index
	"""
	from frappe.website.render import render_page
	from frappe.utils import set_request
	frappe.set_user('Guest')
	frappe.local.no_cache = True

	try:
		set_request(method='GET', path=route)
		content = render_page(route)
		soup = BeautifulSoup(content, 'html.parser')
		page_content = soup.find(class_='page_content')
		text_content = page_content.text if page_content else ''
		title = soup.title.text.strip() if soup.title else route

		value = dict(
			doctype='Static Web Page',
			name=route,
			content=text_content,
			published=1,
			title=title,
			route=route
		)
		sync_value_in_queue(value)
	except Exception:
		# (fixed: the tuple previously listed PermissionError etc. and then
		# Exception, which already covers all of them)
		pass
	finally:
		# always restore the session user, even on unexpected exits
		frappe.set_user('Administrator')
def get_formatted_value(value, field):
	"""
	Prepare field from raw data: for rich-text fields, unescape HTML
	entities, strip script/style blocks, and collapse whitespace; finally
	strip remaining tags and prefix the field label.
	:param value: raw field value
	:param field: docfield (reads `fieldtype` and `label`)
	:return: "Label : cleaned value" string
	"""
	from six.moves.html_parser import HTMLParser

	if getattr(field, 'fieldtype', None) in ["Text", "Text Editor"]:
		h = HTMLParser()
		value = h.unescape(frappe.safe_decode(value))
		# (fixed: the inline (?s) flag was at the END of the pattern —
		# deprecated since Python 3.6 and a hard error since 3.11; moving
		# it to the front is semantically identical)
		value = (re.subn(r'(?s)<[\s]*(script|style).*?</\1>', '', text_type(value))[0])
		value = ' '.join(value.split())
	return field.label + " : " + strip_html_tags(text_type(value))
def sync_global_search():
	"""
	Drain `global_search_queue` (redis) into `__global_search`.
	Called via the job scheduler.
	"""
	while frappe.cache().llen('global_search_queue') > 0:
		raw = frappe.cache().lpop('global_search_queue')
		sync_value(json.loads(raw.decode('utf-8')))
def sync_value_in_queue(value):
	"""
	Push `value` onto the redis-backed `global_search_queue`; if redis is
	unreachable, write it straight to the table instead.
	:param value: dict of { doctype, name, content, published, title, route }
	"""
	try:
		# append to search queue if connected
		frappe.cache().lpush('global_search_queue', json.dumps(value))
	except redis.exceptions.ConnectionError:
		# not connected, sync directly
		sync_value(value)
def sync_value(value):
	'''
	Sync a given document to global search: insert, or update in place when
	a row with the same (doctype, name) key already exists.
	:param value: dict of { doctype, name, content, published, title, route }
	'''
	# parameterized upsert — values are bound by the driver, not interpolated
	frappe.db.multisql({
		'mariadb': '''INSERT INTO `__global_search`
			(`doctype`, `name`, `content`, `published`, `title`, `route`)
			VALUES (%(doctype)s, %(name)s, %(content)s, %(published)s, %(title)s, %(route)s)
			ON DUPLICATE key UPDATE
				`content`=%(content)s,
				`published`=%(published)s,
				`title`=%(title)s,
				`route`=%(route)s
		''',
		'postgres': '''INSERT INTO `__global_search`
			(`doctype`, `name`, `content`, `published`, `title`, `route`)
			VALUES (%(doctype)s, %(name)s, %(content)s, %(published)s, %(title)s, %(route)s)
			ON CONFLICT("doctype", "name") DO UPDATE SET
				`content`=%(content)s,
				`published`=%(published)s,
				`title`=%(title)s,
				`route`=%(route)s
		'''
	}, value)
def delete_for_document(doc):
	"""
	Drop the `__global_search` row of a document that has been deleted.
	:param doc: the deleted document (reads `doctype` and `name`)
	"""
	query = '''DELETE
		FROM `__global_search`
		WHERE doctype = %s
		AND name = %s'''
	frappe.db.sql(query, (doc.doctype, doc.name), as_dict=True)
@frappe.whitelist()
def search(text, start=0, limit=20, doctype=""):
	"""
	Search for given text in __global_search
	:param text: phrase to be searched; '&'-separated terms are queried separately
	:param start: start results at, default 0
	:param limit: number of results to return, default 20
	:param doctype: restrict the search to this doctype when given
	:return: Array of result objects, ordered by allowed-doctype priority
	"""
	from frappe.desk.doctype.global_search_settings.global_search_settings import get_doctypes_for_global_search
	results = []
	sorted_results = []
	allowed_doctypes = get_doctypes_for_global_search()

	for text in set(text.split('&')):
		text = text.strip()
		if not text:
			continue

		conditions = '1=1'
		offset = ''

		# per-backend full-text match expression; the search term is escaped
		# before being interpolated into the SELECT list
		mariadb_text = frappe.db.escape('+' + text + '*')
		mariadb_fields = '`doctype`, `name`, `content`, MATCH (`content`) AGAINST ({} IN BOOLEAN MODE) AS rank'.format(mariadb_text)
		postgres_fields = '`doctype`, `name`, `content`, TO_TSVECTOR("content") @@ PLAINTO_TSQUERY({}) AS rank'.format(frappe.db.escape(text))
		values = {}

		if doctype:
			conditions = '`doctype` = %(doctype)s'
			values['doctype'] = doctype
		elif allowed_doctypes:
			conditions = '`doctype` IN %(allowed_doctypes)s'
			values['allowed_doctypes'] = tuple(allowed_doctypes)

		if int(start) > 0:
			offset = 'OFFSET {}'.format(start)

		common_query = """
				SELECT {fields}
				FROM `__global_search`
				WHERE {conditions}
				ORDER BY rank DESC
				LIMIT {limit}
				{offset}
			"""

		result = frappe.db.multisql({
			'mariadb': common_query.format(fields=mariadb_fields, conditions=conditions, limit=limit, offset=offset),
			'postgres': common_query.format(fields=postgres_fields, conditions=conditions, limit=limit, offset=offset)
		}, values=values, as_dict=True)
		results.extend(result)

	# sort results based on allowed_doctype's priority
	# NOTE(review): this loop variable shadows the `doctype` parameter —
	# harmless here because the parameter is no longer read, but confusing
	for doctype in allowed_doctypes:
		for index, r in enumerate(results):
			# keep only rows with a positive full-text rank
			if r.doctype == doctype and r.rank > 0.0:
				try:
					meta = frappe.get_meta(r.doctype)
					if meta.image_field:
						r.image = frappe.db.get_value(r.doctype, r.name, meta.image_field)
				except Exception:
					# image lookup is best-effort; discard any queued messages
					frappe.clear_messages()

				sorted_results.extend([r])

	return sorted_results
@frappe.whitelist(allow_guest=True)
def web_search(text, scope=None, start=0, limit=20):
	"""
	Search for given text in __global_search where published = 1
	:param text: phrase to be searched; '&'-separated terms are queried separately
	:param scope: search only in this route, for e.g /docs
	:param start: start results at, default 0
	:param limit: number of results to return, default 20
	:return: Array of result objects, sorted by how many words of the search
		term appear in each row's title
	"""
	results = []
	texts = text.split('&')
	for text in texts:
		common_query = ''' SELECT `doctype`, `name`, `content`, `title`, `route`
			FROM `__global_search`
			WHERE {conditions}
			LIMIT {limit} OFFSET {start}'''

		# (fixed) guest-accessible endpoint: escape the user-supplied scope
		# instead of interpolating it raw into the SQL string
		scope_condition = '`route` like {} AND '.format(frappe.db.escape(scope + '%')) if scope else ''
		published_condition = '`published` = 1 AND '
		mariadb_conditions = postgres_conditions = ' '.join([published_condition, scope_condition])

		# https://mariadb.com/kb/en/library/full-text-index-overview/#in-boolean-mode
		text = '"{}"'.format(text)
		mariadb_conditions += 'MATCH(`content`) AGAINST ({} IN BOOLEAN MODE)'.format(frappe.db.escape(text))
		postgres_conditions += 'TO_TSVECTOR("content") @@ PLAINTO_TSQUERY({})'.format(frappe.db.escape(text))

		# (fixed) cast paging args to int before formatting them into SQL
		result = frappe.db.multisql({
			'mariadb': common_query.format(conditions=mariadb_conditions, limit=cint(limit), start=cint(start)),
			'postgres': common_query.format(conditions=postgres_conditions, limit=cint(limit), start=cint(start))
		}, as_dict=True)
		tmp_result = []
		for i in result:
			# NOTE(review): keeps a row only if it already matched a previous
			# term (or this is the first term) — an approximate AND across
			# terms, but it also re-appends duplicates; preserved as-is
			if i in results or not results:
				tmp_result.append(i)
		results += tmp_result

	# chart of accounts -> {chart, of, accounts}
	# titles that match the most of these words will have high relevance
	words = set(get_distinct_words(text))
	for r in results:
		title_words = set(get_distinct_words(r.title))
		words_match = len(words.intersection(title_words))
		r.relevance = words_match

	results = sorted(results, key=lambda x: x.relevance, reverse=True)
	return results
def get_distinct_words(text):
	"""
	Lower-case the space-separated words of `text`, dropping quote characters.
	Splitting is on single spaces, so consecutive spaces produce empty
	strings (deduplication is left to the caller).
	:param text: phrase to split
	:return: list of lower-cased words
	"""
	cleaned = text.replace('"', '').replace("'", '')
	return [word.strip().lower() for word in cleaned.split(' ')]