From b696fa6da57cd23291358a1e70046ff96a40d5a3 Mon Sep 17 00:00:00 2001 From: Gavin D'souza Date: Fri, 3 Jun 2022 15:36:20 +0530 Subject: [PATCH] perf: Pre-compile and re-use regexp pattern Converted all possible usages of re.* that weren't compiling the regex separately and re-using it. Separated out the compiled patterns as global variables. Repetitive patterns could be made DRY-er. It would be nicer to have all regexes in a single module so that we could re-use them better, keep track of outdated ones, and keep checks for possible ReDoS issues, etc. --- frappe/build.py | 6 ++- frappe/core/doctype/data_import/importer.py | 7 ++- frappe/core/doctype/doctype/doctype.py | 21 ++++----- frappe/database/database.py | 19 ++++---- frappe/database/postgres/database.py | 20 ++++----- frappe/database/query.py | 7 ++- frappe/database/schema.py | 8 ++-- frappe/desk/doctype/note/note.py | 8 ++-- frappe/email/email_body.py | 12 +++--- frappe/email/receive.py | 10 +++-- frappe/model/db_query.py | 20 ++++++--- frappe/model/naming.py | 5 ++- frappe/model/utils/__init__.py | 9 ++-- frappe/recorder.py | 3 +- .../templates/includes/comments/comments.py | 12 +++--- frappe/translate.py | 6 ++- frappe/utils/__init__.py | 29 ++++++++----- frappe/utils/boilerplate.py | 4 +- frappe/utils/data.py | 43 +++++++++++-------- frappe/utils/formatters.py | 4 +- frappe/utils/global_search.py | 4 +- frappe/utils/html_utils.py | 31 +++++++------ frappe/utils/password_strength.py | 4 +- frappe/utils/xlsxutils.py | 2 +- frappe/website/doctype/web_page/web_page.py | 4 +- frappe/website/utils.py | 33 ++++++++------ frappe/www/app.py | 7 ++- 27 files changed, 194 insertions(+), 144 deletions(-) diff --git a/frappe/build.py b/frappe/build.py index e20ee0d698..5923bd05ec 100644 --- a/frappe/build.py +++ b/frappe/build.py @@ -20,6 +20,8 @@ import frappe timestamps = {} app_paths = None sites_path = os.path.abspath(os.getcwd()) +WHITESPACE_PATTERN = re.compile(r"\s+") +HTML_COMMENT_PATTERN = re.compile(r"()") class 
AssetsNotDownloadedError(Exception): @@ -406,10 +408,10 @@ def link_assets_dir(source, target, hard_link=False): def scrub_html_template(content): """Returns HTML content with removed whitespace and comments""" # remove whitespace to a single space - content = re.sub(r"\s+", " ", content) + content = WHITESPACE_PATTERN.sub(" ", content) # strip comments - content = re.sub(r"()", "", content) + content = HTML_COMMENT_PATTERN.sub("", content) return content.replace("'", "'") diff --git a/frappe/core/doctype/data_import/importer.py b/frappe/core/doctype/data_import/importer.py index 01be69ea16..06d7588aef 100644 --- a/frappe/core/doctype/data_import/importer.py +++ b/frappe/core/doctype/data_import/importer.py @@ -4,6 +4,7 @@ import io import json import os +import re import timeit from datetime import date, datetime @@ -22,6 +23,7 @@ INVALID_VALUES = ("", None) MAX_ROWS_IN_PREVIEW = 10 INSERT = "Insert New Records" UPDATE = "Update Existing Records" +DURATION_PATTERN = re.compile(r"^(?:(\d+d)?((^|\s)\d+h)?((^|\s)\d+m)?((^|\s)\d+s)?)$") class Importer: @@ -725,10 +727,7 @@ class Row: ) return elif df.fieldtype == "Duration": - import re - - is_valid_duration = re.match(r"^(?:(\d+d)?((^|\s)\d+h)?((^|\s)\d+m)?((^|\s)\d+s)?)$", value) - if not is_valid_duration: + if not DURATION_PATTERN.match(value): self.warnings.append( { "row": self.row_number, diff --git a/frappe/core/doctype/doctype/doctype.py b/frappe/core/doctype/doctype/doctype.py index 047c48e9d5..3e58146ae7 100644 --- a/frappe/core/doctype/doctype/doctype.py +++ b/frappe/core/doctype/doctype/doctype.py @@ -35,6 +35,12 @@ from frappe.query_builder.functions import Concat from frappe.utils import cint from frappe.website.utils import clear_cache +DEPENDS_ON_PATTERN = re.compile(r'[\w\.:_]+\s*={1}\s*[\w\.@\'"]+') +ILLEGAL_FIELDNAME_PATTERN = re.compile("""['",./%@()<>{}]""") +WHITESPACE_PADDING_PATTERN = re.compile(r"^[ \t\n\r]+|[ \t\n\r]+$", flags=re.ASCII) +START_WITH_LETTERS_PATTERN = 
re.compile(r"^(?![\W])[^\d_\s][\w -]+$", flags=re.ASCII) +FIELD_PATTERN = re.compile("{(.*?)}", flags=re.UNICODE) + class InvalidFieldNameError(frappe.ValidationError): pass @@ -357,8 +363,7 @@ class DocType(Document): else: if d.fieldname in restricted: frappe.throw(_("Fieldname {0} is restricted").format(d.fieldname), InvalidFieldNameError) - - d.fieldname = re.sub("""['",./%@()<>{}]""", "", d.fieldname) + d.fieldname = ILLEGAL_FIELDNAME_PATTERN.sub("", d.fieldname) # fieldnames should be lowercase d.fieldname = d.fieldname.lower() @@ -842,15 +847,13 @@ class DocType(Document): _("Doctype name is limited to {0} characters ({1})").format(max_length, name), frappe.NameError ) - flags = {"flags": re.ASCII} - # a DocType name should not start or end with an empty space - if re.search(r"^[ \t\n\r]+|[ \t\n\r]+$", name, **flags): + if WHITESPACE_PADDING_PATTERN.search(name): frappe.throw(_("DocType's name should not start or end with whitespace"), frappe.NameError) # a DocType's name should not start with a number or underscore # and should only contain letters, numbers, underscore, and hyphen - if not re.match(r"^(?![\W])[^\d_\s][\w -]+$", name, **flags): + if not START_WITH_LETTERS_PATTERN.match(name): frappe.throw( _( "A DocType's name should start with a letter and can only " @@ -1254,7 +1257,7 @@ def validate_fields(meta): if not pattern: return - for fieldname in re.findall("{(.*?)}", pattern, re.UNICODE): + for fieldname in FIELD_PATTERN.findall(pattern): if fieldname.startswith("{"): # edge case when double curlies are used for escape continue @@ -1336,9 +1339,7 @@ def validate_fields(meta): ] for field in depends_on_fields: depends_on = docfield.get(field, None) - if ( - depends_on and ("=" in depends_on) and re.match(r'[\w\.:_]+\s*={1}\s*[\w\.@\'"]+', depends_on) - ): + if depends_on and ("=" in depends_on) and DEPENDS_ON_PATTERN.match(depends_on): frappe.throw(_("Invalid {0} condition").format(frappe.unscrub(field)), frappe.ValidationError) def 
check_table_multiselect_option(docfield): diff --git a/frappe/database/database.py b/frappe/database/database.py index 8f4e91fa63..1de22af037 100644 --- a/frappe/database/database.py +++ b/frappe/database/database.py @@ -23,6 +23,11 @@ from frappe.query_builder.functions import Count from frappe.query_builder.utils import DocType from frappe.utils import cast, get_datetime, get_table_name, getdate, now, sbool +IFNULL_PATTERN = re.compile(r"ifnull\(", flags=re.IGNORECASE) +INDEX_PATTERN = re.compile(r"\s*\([^)]+\)\s*") +SINGLE_WORD_PATTERN = re.compile(r'([`"]?)(tab([A-Z]\w+))\1') +MULTI_WORD_PATTERN = re.compile(r'([`"])(tab([A-Z]\w+)( [A-Z]\w+)+)\1') + class Database(object): """ @@ -143,9 +148,8 @@ class Database(object): # remove whitespace / indentation from start and end of query query = query.strip() - if re.search(r"ifnull\(", query, flags=re.IGNORECASE): - # replaces ifnull in query with coalesce - query = re.sub(r"ifnull\(", "coalesce(", query, flags=re.IGNORECASE) + # replaces ifnull in query with coalesce + query = IFNULL_PATTERN.sub("coalesce(", query) if not self._conn: self.connect() @@ -1126,8 +1130,7 @@ class Database(object): def get_index_name(fields): index_name = "_".join(fields) + "_index" # remove index length if present e.g. (10) from index name - index_name = re.sub(r"\s*\([^)]+\)\s*", r"", index_name) - return index_name + return INDEX_PATTERN.sub(r"", index_name) def get_system_setting(self, key): def _load_system_settings(): @@ -1219,11 +1222,9 @@ class Database(object): # and are continued with multiple words that start with a captital letter # e.g. 
'tabXxx' or 'tabXxx Xxx' or 'tabXxx Xxx Xxx' and so on - single_word_regex = r'([`"]?)(tab([A-Z]\w+))\1' - multi_word_regex = r'([`"])(tab([A-Z]\w+)( [A-Z]\w+)+)\1' tables = [] - for regex in (single_word_regex, multi_word_regex): - tables += [groups[1] for groups in re.findall(regex, query)] + for regex in (SINGLE_WORD_PATTERN, MULTI_WORD_PATTERN): + tables += [groups[1] for groups in regex.findall(query)] if frappe.flags.touched_tables is None: frappe.flags.touched_tables = set() diff --git a/frappe/database/postgres/database.py b/frappe/database/postgres/database.py index 8bd4113823..14872b2b16 100644 --- a/frappe/database/postgres/database.py +++ b/frappe/database/postgres/database.py @@ -20,6 +20,11 @@ DEC2FLOAT = psycopg2.extensions.new_type( psycopg2.extensions.register_type(DEC2FLOAT) +LOCATE_SUB_PATTERN = re.compile(r"locate\(([^,]+),([^)]+)(\)?)\)", flags=re.IGNORECASE) +LOCATE_QUERY_PATTERN = re.compile(r"locate\(", flags=re.IGNORECASE) +PG_TRANSFORM_PATTERN = re.compile(r"([=><]+)\s*([+-]?\d+)(\.0)?(?![a-zA-Z\.\d])") +FROM_TAB_PATTERN = re.compile(r"from tab([\w-]*)", flags=re.IGNORECASE) + class PostgresDatabase(Database): ProgrammingError = psycopg2.ProgrammingError @@ -382,12 +387,10 @@ class PostgresDatabase(Database): def modify_query(query): """ "Modifies query according to the requirements of postgres""" # replace ` with " for definitions - query = str(query) - query = query.replace("`", '"') + query = str(query).replace("`", '"') query = replace_locate_with_strpos(query) # select from requires "" - if re.search("from tab", query, flags=re.IGNORECASE): - query = re.sub(r"from tab([\w-]*)", r'from "tab\1"', query, flags=re.IGNORECASE) + query = FROM_TAB_PATTERN.sub(r'from "tab\1"', query) # only find int (with/without signs), ignore decimals (with/without signs), ignore hashes (which start with numbers), # drop .0 from decimals and add quotes around them @@ -396,8 +399,7 @@ def modify_query(query): # >>> 
re.sub(r"([=><]+)\s*([+-]?\d+)(\.0)?(?![a-zA-Z\.\d])", r"\1 '\2'", query) # "c='abcd' , a >= '45', b = '-45', c = '40', d= '4500', e=3500.53, f=40psdfsd, g= '9092094312', h=12.00023 - query = re.sub(r"([=><]+)\s*([+-]?\d+)(\.0)?(?![a-zA-Z\.\d])", r"\1 '\2'", query) - return query + return PG_TRANSFORM_PATTERN.sub(r"\1 '\2'", query) def modify_values(values): @@ -430,8 +432,6 @@ def modify_values(values): def replace_locate_with_strpos(query): # strpos is the locate equivalent in postgres - if re.search(r"locate\(", query, flags=re.IGNORECASE): - query = re.sub( - r"locate\(([^,]+),([^)]+)(\)?)\)", r"strpos(\2\3, \1)", query, flags=re.IGNORECASE - ) + if LOCATE_QUERY_PATTERN.search(query): + query = LOCATE_SUB_PATTERN.sub(r"strpos(\2\3, \1)", query) return query diff --git a/frappe/database/query.py b/frappe/database/query.py index cfeafd6a37..f7cc143cf7 100644 --- a/frappe/database/query.py +++ b/frappe/database/query.py @@ -9,6 +9,9 @@ from frappe.boot import get_additional_filters_from_hooks from frappe.model.db_query import get_timespan_date_range from frappe.query_builder import Criterion, Field, Order, Table +TAB_PATTERN = re.compile("^tab") +WORDS_PATTERN = re.compile(r"\w+") + def like(key: Field, value: str) -> frappe.qb: """Wrapper method for `LIKE` @@ -391,7 +394,7 @@ class Permission: doctype = [doctype] for dt in doctype: - dt = re.sub("^tab", "", dt) + dt = TAB_PATTERN.sub("", dt) if not frappe.has_permission( dt, "select", @@ -407,4 +410,4 @@ class Permission: @staticmethod def get_tables_from_query(query: str): - return [table for table in re.findall(r"\w+", query) if table.startswith("tab")] + return [table for table in WORDS_PATTERN.findall(query) if table.startswith("tab")] diff --git a/frappe/database/schema.py b/frappe/database/schema.py index 19af447aae..9a8307ddae 100644 --- a/frappe/database/schema.py +++ b/frappe/database/schema.py @@ -4,6 +4,9 @@ import frappe from frappe import _ from frappe.utils import cint, cstr, flt 
+SPECIAL_CHAR_PATTERN = re.compile(r"[\W]", flags=re.UNICODE) +VARCHAR_CAST_PATTERN = re.compile(r"varchar\(([\d]+)\)") + class InvalidColumnName(frappe.ValidationError): pass @@ -130,7 +133,7 @@ class DBTable: if not current_col: continue current_type = self.current_columns[col.fieldname]["type"] - current_length = re.findall(r"varchar\(([\d]+)\)", current_type) + current_length = VARCHAR_CAST_PATTERN.findall(current_type) if not current_length: # case when the field is no longer a varchar continue @@ -304,8 +307,7 @@ class DbColumn: def validate_column_name(n): - special_characters = re.findall(r"[\W]", n, re.UNICODE) - if special_characters: + if special_characters := SPECIAL_CHAR_PATTERN.findall(n): special_characters = ", ".join('"{0}"'.format(c) for c in special_characters) frappe.throw( _("Fieldname {0} cannot have special characters like {1}").format( diff --git a/frappe/desk/doctype/note/note.py b/frappe/desk/doctype/note/note.py index d67ecda594..a709b80f1d 100644 --- a/frappe/desk/doctype/note/note.py +++ b/frappe/desk/doctype/note/note.py @@ -1,16 +1,18 @@ # Copyright (c) 2015, Frappe Technologies Pvt. Ltd. and Contributors # License: MIT. 
See LICENSE +import re + import frappe from frappe.model.document import Document +NAME_PATTERN = re.compile("[%'\"#*?`]") + class Note(Document): def autoname(self): # replace forbidden characters - import re - - self.name = re.sub("[%'\"#*?`]", "", self.title.strip()) + self.name = NAME_PATTERN.sub("", self.title.strip()) def validate(self): if self.notify_on_login and not self.expire_notification_on: diff --git a/frappe/email/email_body.py b/frappe/email/email_body.py index 5e2f14d9bf..50c66e1ad2 100755 --- a/frappe/email/email_body.py +++ b/frappe/email/email_body.py @@ -24,6 +24,8 @@ from frappe.utils import ( ) from frappe.utils.pdf import get_pdf +EMBED_PATTERN = re.compile("""embed=["'](.*?)["']""") + def get_email( recipients, @@ -190,7 +192,7 @@ class EMail: def set_part_html(self, message, inline_images): from email.mime.text import MIMEText - has_inline_images = re.search("""embed=['"].*?['"]""", message) + has_inline_images = EMBED_PATTERN.search(message) if has_inline_images: # process inline images @@ -499,7 +501,7 @@ def replace_filename_with_cid(message): inline_images = [] while True: - matches = re.search("""embed=["'](.*?)["']""", message) + matches = EMBED_PATTERN.search(message) if not matches: break groups = matches.groups() @@ -510,7 +512,7 @@ def replace_filename_with_cid(message): filecontent = get_filecontent_from_path(img_path) if not filecontent: - message = re.sub("""embed=['"]{0}['"]""".format(img_path), "", message) + message = re.sub(f"""embed=['"]{img_path}['"]""", "", message) continue content_id = random_string(10) @@ -519,9 +521,7 @@ def replace_filename_with_cid(message): {"filename": filename, "filecontent": filecontent, "content_id": content_id} ) - message = re.sub( - """embed=['"]{0}['"]""".format(img_path), 'src="cid:{0}"'.format(content_id), message - ) + message = re.sub(f"""embed=['"]{img_path}['"]""", f'src="cid:{content_id}"', message) return (message, inline_images) diff --git a/frappe/email/receive.py 
b/frappe/email/receive.py index 51474d111d..12ab04eb4b 100644 --- a/frappe/email/receive.py +++ b/frappe/email/receive.py @@ -38,6 +38,9 @@ from frappe.utils.user import is_system_user # fix due to a python bug in poplib that limits it to 2048 poplib._MAXLINE = 20480 +THREAD_ID_PATTERN = re.compile(r"(?<=\[)[\w/-]+") +WORDS_PATTERN = re.compile(r"\w+") + class EmailSizeExceededError(frappe.ValidationError): pass @@ -273,7 +276,7 @@ class EmailServer: return def parse_imap_response(self, cmd, response): - pattern = r"(?<={cmd} )[0-9]*".format(cmd=cmd) + pattern = rf"(?<={cmd} )[0-9]*" match = re.search(pattern, response.decode("utf-8"), re.U | re.I) if match: @@ -332,8 +335,7 @@ class EmailServer: flags = [] for flag in imaplib.ParseFlags(flag_string) or []: - pattern = re.compile(r"\w+") - match = re.search(pattern, frappe.as_unicode(flag)) + match = WORDS_PATTERN.search(frappe.as_unicode(flag)) flags.append(match.group(0)) if "Seen" in flags: @@ -622,7 +624,7 @@ class Email: def get_thread_id(self): """Extract thread ID from `[]`""" - l = re.findall(r"(?<=\[)[\w/-]+", self.subject) + l = THREAD_ID_PATTERN.findall(self.subject) return l and l[0] or None def is_reply(self): diff --git a/frappe/model/db_query.py b/frappe/model/db_query.py index c101b5eb35..eab0e5359b 100644 --- a/frappe/model/db_query.py +++ b/frappe/model/db_query.py @@ -29,6 +29,14 @@ from frappe.utils import ( make_filter_tuple, ) +LOCATE_PATTERN = re.compile(r"locate\([^,]+,\s*[`\"]?name[`\"]?\s*\)") +LOCATE_CAST_PATTERN = re.compile(r"locate\(([^,]+),\s*([`\"]?name[`\"]?)\s*\)") +FUNC_IFNULL_PATTERN = re.compile(r"(strpos|ifnull|coalesce)\(\s*[`\"]?name[`\"]?\s*,") +CAST_VARCHAR_PATTERN = re.compile(r"([`\"]?tab[\w`\" -]+\.[`\"]?name[`\"]?)(?!\w)") +ORDER_BY_PATTERN = re.compile( + r"\ order\ by\ |\ asc|\ ASC|\ desc|\ DESC", +) + class DatabaseQuery(object): def __init__(self, doctype, user=None): @@ -266,7 +274,7 @@ class DatabaseQuery(object): return args def prepare_select_args(self, args): - 
order_field = re.sub(r"\ order\ by\ |\ asc|\ ASC|\ desc|\ DESC", "", args.order_by) + order_field = ORDER_BY_PATTERN.sub("", args.order_by) if order_field not in args.fields: extracted_column = order_column = order_field.replace("`", "") @@ -957,16 +965,14 @@ def cast_name(column: str) -> str: kwargs = {"string": column, "flags": re.IGNORECASE} if "cast(" not in column.lower() and "::" not in column: - if re.search(r"locate\([^,]+,\s*[`\"]?name[`\"]?\s*\)", **kwargs): - return re.sub( - r"locate\(([^,]+),\s*([`\"]?name[`\"]?)\s*\)", r"locate(\1, cast(\2 as varchar))", **kwargs - ) + if LOCATE_PATTERN.search(**kwargs): + return LOCATE_CAST_PATTERN.sub(r"locate(\1, cast(\2 as varchar))", **kwargs) - elif match := re.search(r"(strpos|ifnull|coalesce)\(\s*[`\"]?name[`\"]?\s*,", **kwargs): + elif match := FUNC_IFNULL_PATTERN.search(**kwargs): func = match.groups()[0] return re.sub(rf"{func}\(\s*([`\"]?name[`\"]?)\s*,", rf"{func}(cast(\1 as varchar),", **kwargs) - return re.sub(r"([`\"]?tab[\w`\" -]+\.[`\"]?name[`\"]?)(?!\w)", r"cast(\1 as varchar)", **kwargs) + return CAST_VARCHAR_PATTERN.sub(r"cast(\1 as varchar)", **kwargs) return column diff --git a/frappe/model/naming.py b/frappe/model/naming.py index d63466e556..f6a3846699 100644 --- a/frappe/model/naming.py +++ b/frappe/model/naming.py @@ -20,6 +20,7 @@ if TYPE_CHECKING: autoincremented_site_status_map = {} NAMING_SERIES_PATTERN = re.compile(r"^[\w\- \/.#{}]+$", re.UNICODE) +BRACED_PARAMS_PATTERN = re.compile(r"(\{[\w | #]+\})") class InvalidNamingSeriesError(frappe.ValidationError): @@ -448,7 +449,7 @@ def validate_name(doctype: str, name: Union[int, str], case: Optional[str] = Non frappe.throw(_("Name of {0} cannot be {1}").format(doctype, name), frappe.NameError) special_characters = "<>" - if re.findall("[{0}]+".format(special_characters), name): + if re.findall(f"[{special_characters}]+", name): message = ", ".join("'{0}'".format(c) for c in special_characters) frappe.throw( _("Name cannot contain special 
characters like {0}").format(message), frappe.NameError @@ -535,6 +536,6 @@ def _format_autoname(autoname, doc): return parse_naming_series([trimmed_param], doc=doc) # Replace braced params with their parsed value - name = re.sub(r"(\{[\w | #]+\})", get_param_value_for_match, autoname_value) + name = BRACED_PARAMS_PATTERN.sub(get_param_value_for_match, autoname_value) return name diff --git a/frappe/model/utils/__init__.py b/frappe/model/utils/__init__.py index a0dd0d89e8..6385b61c38 100644 --- a/frappe/model/utils/__init__.py +++ b/frappe/model/utils/__init__.py @@ -21,10 +21,7 @@ STANDARD_FIELD_CONVERSION_MAP = { "_assign": "Text", "docstatus": "Int", } - -""" -Model utilities, unclassified functions -""" +INCLUDE_DIRECTIVE_PATTERN = re.compile(r"""{% include\s['"](.*)['"]\s%}""") def set_default(doc, key): @@ -67,7 +64,7 @@ def render_include(content): # try 5 levels of includes for i in range(5): if "{% include" in content: - paths = re.findall(r"""{% include\s['"](.*)['"]\s%}""", content) + paths = INCLUDE_DIRECTIVE_PATTERN.findall(content) if not paths: frappe.throw(_("Invalid include path"), InvalidIncludePath) @@ -78,7 +75,7 @@ def render_include(content): if path.endswith(".html"): include = html_to_js_template(path, include) - content = re.sub(r"""{{% include\s['"]{0}['"]\s%}}""".format(path), include, content) + content = re.sub(rf"""{{% include\s['"]{path}['"]\s%}}""", include, content) else: break diff --git a/frappe/recorder.py b/frappe/recorder.py index 95b78dd085..87e001fe31 100644 --- a/frappe/recorder.py +++ b/frappe/recorder.py @@ -16,6 +16,7 @@ from frappe import _ RECORDER_INTERCEPT_FLAG = "recorder-intercept" RECORDER_REQUEST_SPARSE_HASH = "recorder-requests-sparse" RECORDER_REQUEST_HASH = "recorder-requests" +TRACEBACK_PATH_PATTERN = re.compile(".*/apps/") def sql(*args, **kwargs): @@ -58,7 +59,7 @@ def get_current_stack_frames(): for frame, filename, lineno, function, context, index in list(reversed(frames))[:-2]: if "/apps/" in filename: 
yield { - "filename": re.sub(".*/apps/", "", filename), + "filename": TRACEBACK_PATH_PATTERN.sub("", filename), "lineno": lineno, "function": function, } diff --git a/frappe/templates/includes/comments/comments.py b/frappe/templates/includes/comments/comments.py index dd94cfa989..b98106c0dc 100644 --- a/frappe/templates/includes/comments/comments.py +++ b/frappe/templates/includes/comments/comments.py @@ -10,6 +10,11 @@ from frappe.utils.html_utils import clean_html from frappe.website.doctype.blog_settings.blog_settings import get_comment_limit from frappe.website.utils import clear_cache +URLS_COMMENT_PATTERN = re.compile( + r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", re.IGNORECASE +) +EMAIL_PATTERN = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", re.IGNORECASE) + @frappe.whitelist(allow_guest=True) @rate_limit(key="reference_name", limit=get_comment_limit, seconds=60 * 60) @@ -23,12 +28,7 @@ def add_comment(comment, comment_email, comment_by, reference_doctype, reference frappe.msgprint(_("The comment cannot be empty")) return False - url_regex = re.compile( - r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", re.IGNORECASE - ) - email_regex = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", re.IGNORECASE) - - if url_regex.search(comment) or email_regex.search(comment): + if URLS_COMMENT_PATTERN.search(comment) or EMAIL_PATTERN.search(comment): frappe.msgprint(_("Comments cannot have links or email addresses")) return False diff --git a/frappe/translate.py b/frappe/translate.py index 3123eade48..eb26124ba8 100644 --- a/frappe/translate.py +++ b/frappe/translate.py @@ -48,6 +48,8 @@ TRANSLATE_PATTERN = re.compile( # END: JS context search r"[\s\n]*\)" # Closing function call ignore leading whitespace/newlines ) +REPORT_TRANSLATE_PATTERN = re.compile('"([^:,^"]*):') +CSV_STRIP_WHITESPACE_PATTERN = re.compile(r"{\s?([0-9]+)\s?}") def get_language(lang_list: 
List = None) -> str: @@ -602,7 +604,7 @@ def get_messages_from_report(name): messages.extend( [ (None, message) - for message in re.findall('"([^:,^"]*):', report.query) + for message in REPORT_TRANSLATE_PATTERN.findall(report.query) if is_translatable(message) ] ) @@ -801,7 +803,7 @@ def write_csv_file(path, app_messages, lang_dict): t = lang_dict.get(message, "") # strip whitespaces - translated_string = re.sub(r"{\s?([0-9]+)\s?}", r"{\g<1>}", t) + translated_string = CSV_STRIP_WHITESPACE_PATTERN.sub(r"{\g<1>}", t) if translated_string: w.writerow([message, translated_string, context]) diff --git a/frappe/utils/__init__.py b/frappe/utils/__init__.py index 1af0ec6a39..e06a0c8cba 100644 --- a/frappe/utils/__init__.py +++ b/frappe/utils/__init__.py @@ -27,6 +27,16 @@ import frappe from frappe.utils.data import * from frappe.utils.html_utils import sanitize_html +EMAIL_NAME_PATTERN = re.compile(r"[^A-Za-z0-9\u00C0-\u024F\/\_\' ]+") +EMAIL_STRING_PATTERN = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)") +NON_MD_HTML_PATTERN = re.compile(r"|") +HTML_TAGS_PATTERN = re.compile(r"\<[^>]*\>") +INCLUDE_DIRECTIVE_PATTERN = re.compile("""({% include ['"]([^'"]*)['"] %})""") +PHONE_NUMBER_PATTERN = re.compile(r"([0-9\ \+\_\-\,\.\*\#\(\)]){1,20}$") +PERSON_NAME_PATTERN = re.compile(r"^[\w][\w\'\-]*( \w[\w\'\-]*)*$") +WHITESPACE_PATTERN = re.compile(r"[\t\n\r]") +MULTI_EMAIL_STRING_PATTERN = re.compile(r'[,\n](?=(?:[^"]|"[^"]*")*$)') + def get_fullname(user=None): """get the full name (first name + last name) of the user from User""" @@ -116,7 +126,7 @@ def validate_phone_number(phone_number, throw=False): return False phone_number = phone_number.strip() - match = re.match(r"([0-9\ \+\_\-\,\.\*\#\(\)]){1,20}$", phone_number) + match = PHONE_NUMBER_PATTERN.match(phone_number) if not match and throw: frappe.throw( @@ -135,7 +145,7 @@ def validate_name(name, throw=False): return False name = name.strip() - match = re.match(r"^[\w][\w\'\-]*( \w[\w\'\-]*)*$", name) + 
match = PERSON_NAME_PATTERN.match(PERSON_NAME_PATTERN, name) if not match and throw: frappe.throw(frappe._("{0} is not a valid Name").format(name), frappe.InvalidNameError) @@ -201,8 +211,8 @@ def split_emails(txt): email_list = [] # emails can be separated by comma or newline - s = re.sub(r"[\t\n\r]", " ", cstr(txt)) - for email in re.split(r'[,\n](?=(?:[^"]|"[^"]*")*$)', s): + s = WHITESPACE_PATTERN.sub(" ", cstr(txt)) + for email in MULTI_EMAIL_STRING_PATTERN.split(s): email = strip(cstr(email)) if email: email_list.append(email) @@ -360,7 +370,7 @@ def remove_blanks(d): def strip_html_tags(text): """Remove html tags from text""" - return re.sub(r"\<[^>]*\>", "", text) + return HTML_TAGS_PATTERN.sub("", text) def get_file_timestamp(fn): @@ -584,7 +594,7 @@ def get_html_format(print_path): with open(print_path, "r") as f: html_format = f.read() - for include_directive, path in re.findall("""({% include ['"]([^'"]*)['"] %})""", html_format): + for include_directive, path in INCLUDE_DIRECTIVE_PATTERN.findall(html_format): for app_name in frappe.get_installed_apps(): include_path = frappe.get_app_path(app_name, *path.split(os.path.sep)) if os.path.exists(include_path): @@ -601,7 +611,7 @@ def is_markdown(text): elif "" in text: return False else: - return not re.search(r"|", text) + return not NON_MD_HTML_PATTERN.search(text) def get_sites(sites_path=None): @@ -670,8 +680,7 @@ def parse_addr(email_string): name = get_name_from_email_string(email_string, email, name) return (name, email) else: - email_regex = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)") - email_list = re.findall(email_regex, email_string) + email_list = EMAIL_STRING_PATTERN.findall(email_string) if len(email_list) > 0 and check_format(email_list[0]): # take only first email address email = email_list[0] @@ -698,7 +707,7 @@ def check_format(email_id): def get_name_from_email_string(email_string, email_id, name): name = email_string.replace(email_id, "") - name = 
re.sub(r"[^A-Za-z0-9\u00C0-\u024F\/\_\' ]+", "", name).strip() + name = EMAIL_NAME_PATTERN.sub("", name).strip() if not name: name = email_id return name diff --git a/frappe/utils/boilerplate.py b/frappe/utils/boilerplate.py index 22ca64eb1a..05a227fb5e 100644 --- a/frappe/utils/boilerplate.py +++ b/frappe/utils/boilerplate.py @@ -11,6 +11,8 @@ import git import frappe from frappe.utils import touch_file +APP_TITLE_PATTERN = re.compile(r"^(?![\W])[^\d_\s][\w -]+$") + def make_boilerplate(dest, app_name, no_git=False): if not os.path.exists(dest): @@ -67,7 +69,7 @@ def _get_user_inputs(app_name): def is_valid_title(title) -> bool: - if not re.match(r"^(?![\W])[^\d_\s][\w -]+$", title, re.UNICODE): + if not APP_TITLE_PATTERN.match(title, re.UNICODE): print( "App Title should start with a letter and it can only consist of letters, numbers, spaces and underscores" ) diff --git a/frappe/utils/data.py b/frappe/utils/data.py index 0b79580009..60770ef6a9 100644 --- a/frappe/utils/data.py +++ b/frappe/utils/data.py @@ -29,7 +29,22 @@ if typing.TYPE_CHECKING: DATE_FORMAT = "%Y-%m-%d" TIME_FORMAT = "%H:%M:%S.%f" -DATETIME_FORMAT = DATE_FORMAT + " " + TIME_FORMAT +DATETIME_FORMAT = f"{DATE_FORMAT} {TIME_FORMAT}" +TIMEDELTA_DAY_PATTERN = re.compile( + r"(?P[-\d]+) day[s]*, (?P\d+):(?P\d+):(?P\d[\.\d+]*)" +) +TIMEDELTA_BASE_PATTERN = re.compile(r"(?P\d+):(?P\d+):(?P\d[\.\d+]*)") +URLS_HTTP_TAG_PATTERN = re.compile( + r'(href|src){1}([\s]*=[\s]*[\'"]?)((?:http)[^\'">]+)([\'"]?)' +) # href='https://... +URLS_NOT_HTTP_TAG_PATTERN = re.compile( + r'(href|src){1}([\s]*=[\s]*[\'"]?)((?!http)[^\'" >]+)([\'"]?)' +) # href=/assets/... 
+URL_NOTATION_PATTERN = re.compile( + r'(:[\s]?url)(\([\'"]?)((?!http)[^\'" >]+)([\'"]?\))' +) # background-image: url('/assets/...') +DURATION_PATTERN = re.compile(r"^(?:(\d+d)?((^|\s)\d+h)?((^|\s)\d+m)?((^|\s)\d+s)?)$") +HTML_TAG_PATTERN = re.compile("<[^>]+>") class Weekday(Enum): @@ -692,10 +707,7 @@ def duration_to_seconds(duration): def validate_duration_format(duration): - import re - - is_valid_duration = re.match(r"^(?:(\d+d)?((^|\s)\d+h)?((^|\s)\d+m)?((^|\s)\d+s)?)$", duration) - if not is_valid_duration: + if not DURATION_PATTERN.match(duration): frappe.throw( frappe._("Value {0} must be in the valid duration format: d h m s").format( frappe.bold(duration) @@ -1297,7 +1309,7 @@ def in_words(integer: int, in_million=True) -> str: def is_html(text: str) -> bool: if not isinstance(text, str): return False - return re.search("<[^>]+>", text) + return HTML_TAG_PATTERN.search(text) def is_image(filepath: str) -> bool: @@ -1851,12 +1863,8 @@ def expand_relative_urls(html: str) -> str: return "".join(to_expand) - html = re.sub( - r'(href|src){1}([\s]*=[\s]*[\'"]?)((?!http)[^\'" >]+)([\'"]?)', _expand_relative_urls, html - ) - - # background-image: url('/assets/...') - html = re.sub(r'(:[\s]?url)(\([\'"]?)((?!http)[^\'" >]+)([\'"]?\))', _expand_relative_urls, html) + html = URLS_NOT_HTTP_TAG_PATTERN.sub(_expand_relative_urls, html) + html = URL_NOTATION_PATTERN.sub(_expand_relative_urls, html) return html @@ -1870,7 +1878,7 @@ def quote_urls(html: str) -> str: groups[2] = quoted(groups[2]) return "".join(groups) - return re.sub(r'(href|src){1}([\s]*=[\s]*[\'"]?)((?:http)[^\'">]+)([\'"]?)', _quote_url, html) + return URLS_HTTP_TAG_PATTERN.sub(_quote_url, html) def unique(seq: typing.Sequence["T"]) -> List["T"]: @@ -1891,8 +1899,7 @@ def get_string_between(start: str, string: str, end: str) -> str: if not string: return "" - regex = "{0}(.*){1}".format(start, end) - out = re.search(regex, string) + out = re.search(f"{start}(.*){end}", string) return out.group(1) if 
out else string @@ -2098,10 +2105,8 @@ def format_timedelta(o: datetime.timedelta) -> str: def parse_timedelta(s: str) -> datetime.timedelta: # ref: https://stackoverflow.com/a/21074460/10309266 if "day" in s: - m = re.match( - r"(?P[-\d]+) day[s]*, (?P\d+):(?P\d+):(?P\d[\.\d+]*)", s - ) + m = TIMEDELTA_DAY_PATTERN.match(s) else: - m = re.match(r"(?P\d+):(?P\d+):(?P\d[\.\d+]*)", s) + m = TIMEDELTA_BASE_PATTERN.match(s) return datetime.timedelta(**{key: float(val) for key, val in m.groupdict().items()}) diff --git a/frappe/utils/formatters.py b/frappe/utils/formatters.py index adf551580c..575a05a5c2 100644 --- a/frappe/utils/formatters.py +++ b/frappe/utils/formatters.py @@ -20,6 +20,8 @@ from frappe.utils import ( formatdate, ) +BLOCK_TAGS_PATTERN = re.compile(r"(") elif df.get("fieldtype") == "Markdown Editor": diff --git a/frappe/utils/global_search.py b/frappe/utils/global_search.py index 6e482baa78..af6d9a3c28 100644 --- a/frappe/utils/global_search.py +++ b/frappe/utils/global_search.py @@ -13,6 +13,8 @@ from frappe.utils import cint, strip_html_tags from frappe.utils.data import cstr from frappe.utils.html_utils import unescape_html +HTML_TAGS_PATTERN = re.compile(r"(?s)<[\s]*(script|style).*?") + def setup_global_search_table(): """ @@ -360,7 +362,7 @@ def get_formatted_value(value, field): if getattr(field, "fieldtype", None) in ["Text", "Text Editor"]: value = unescape_html(frappe.safe_decode(value)) - value = re.subn(r"(?s)<[\s]*(script|style).*?", "", str(value))[0] + value = HTML_TAGS_PATTERN.subn("", str(value))[0] value = " ".join(value.split()) return field.label + " : " + strip_html_tags(str(value)) diff --git a/frappe/utils/html_utils.py b/frappe/utils/html_utils.py index 8eac761220..b9d0e8dfe2 100644 --- a/frappe/utils/html_utils.py +++ b/frappe/utils/html_utils.py @@ -5,6 +5,16 @@ from bleach_allowlist import bleach_allowlist import frappe +EMOJI_PATTERN = re.compile( + "(\ud83d[\ude00-\ude4f])|" + "(\ud83c[\udf00-\uffff])|" + 
"(\ud83d[\u0000-\uddff])|" + "(\ud83d[\ude80-\udeff])|" + "(\ud83c[\udde0-\uddff])" + "+", + flags=re.UNICODE, +) + def clean_html(html): import bleach @@ -181,28 +191,17 @@ def is_json(text): def get_icon_html(icon, small=False): from frappe.utils import is_image - emoji_pattern = re.compile( - "(\ud83d[\ude00-\ude4f])|" - "(\ud83c[\udf00-\uffff])|" - "(\ud83d[\u0000-\uddff])|" - "(\ud83d[\ude80-\udeff])|" - "(\ud83c[\udde0-\uddff])" - "+", - flags=re.UNICODE, - ) - icon = icon or "" - if icon and emoji_pattern.match(icon): - return '' + icon + "" + + if icon and EMOJI_PATTERN.match(icon): + return f'{icon}' if is_image(icon): return ( - ''.format(icon=icon) - if small - else ''.format(icon=icon) + f'' if small else f'' ) else: - return "".format(icon=icon) + return f"" def unescape_html(value): diff --git a/frappe/utils/password_strength.py b/frappe/utils/password_strength.py index 1f7a171ce9..59c784e5b4 100644 --- a/frappe/utils/password_strength.py +++ b/frappe/utils/password_strength.py @@ -177,9 +177,9 @@ def get_dictionary_match_feedback(match, is_sole_match): word = match.get("token") # Variations of the match like UPPERCASES - if re.match(scoring.START_UPPER, word): + if scoring.START_UPPER.match(word): suggestions.append(_("Capitalization doesn't help very much.")) - elif re.match(scoring.ALL_UPPER, word): + elif scoring.ALL_UPPER.match(word): suggestions.append(_("All-uppercase is almost as easy to guess as all-lowercase.")) # Match contains l33t speak substitutions diff --git a/frappe/utils/xlsxutils.py b/frappe/utils/xlsxutils.py index ad02cd8327..1b898f69a2 100644 --- a/frappe/utils/xlsxutils.py +++ b/frappe/utils/xlsxutils.py @@ -40,7 +40,7 @@ def make_xlsx(data, sheet_name, wb=None, column_widths=None): if isinstance(item, str) and next(ILLEGAL_CHARACTERS_RE.finditer(value), None): # Remove illegal characters from the string - value = re.sub(ILLEGAL_CHARACTERS_RE, "", value) + value = ILLEGAL_CHARACTERS_RE.sub("", value) clean_row.append(value) diff 
--git a/frappe/website/doctype/web_page/web_page.py b/frappe/website/doctype/web_page/web_page.py index f74af1d8c7..bd7bcb8de4 100644 --- a/frappe/website/doctype/web_page/web_page.py +++ b/frappe/website/doctype/web_page/web_page.py @@ -19,6 +19,8 @@ from frappe.website.utils import ( ) from frappe.website.website_generator import WebsiteGenerator +H_TAG_PATTERN = re.compile("") + class WebPage(WebsiteGenerator): def validate(self): @@ -114,7 +116,7 @@ class WebPage(WebsiteGenerator): context.header = context.title # add h1 tag to header - if context.get("header") and not re.findall("", context.header): + if context.get("header") and not H_TAG_PATTERN.findall(context.header): context.header = "

" + context.header + "

" # if title not set, set title from header diff --git a/frappe/website/utils.py b/frappe/website/utils.py index f673a20656..18a1844419 100644 --- a/frappe/website/utils.py +++ b/frappe/website/utils.py @@ -15,6 +15,13 @@ from frappe import _ from frappe.model.document import Document from frappe.utils import md_to_html +FRONTMATTER_PATTERN = re.compile(r"^\s*(?:---|\+\+\+)(.*?)(?:---|\+\+\+)\s*(.+)$", re.S | re.M) +H1_TAG_PATTERN = re.compile("

([^<]*)") +IMAGE_TAG_PATTERN = re.compile(r"""]*src\s?=\s?['"]([^'"]*)['"]""") +CLEANUP_PATTERN_1 = re.compile(r'[~!@#$%^&*+()<>,."\'\?]') +CLEANUP_PATTERN_2 = re.compile("[:/]") +CLEANUP_PATTERN_3 = re.compile(r"(-)\1+") + def delete_page_cache(path): cache = frappe.cache() @@ -29,7 +36,7 @@ def delete_page_cache(path): def find_first_image(html): - m = re.finditer(r"""]*src\s?=\s?['"]([^'"]*)['"]""", html) + m = IMAGE_TAG_PATTERN.finditer(html) try: return next(m).groups()[0] except StopIteration: @@ -156,17 +163,17 @@ def is_signup_disabled(): return frappe.db.get_single_value("Website Settings", "disable_signup", True) -def cleanup_page_name(title): +def cleanup_page_name(title: str) -> str: """make page name from title""" if not title: return "" name = title.lower() - name = re.sub(r'[~!@#$%^&*+()<>,."\'\?]', "", name) - name = re.sub("[:/]", "-", name) + name = CLEANUP_PATTERN_1.sub("", name) + name = CLEANUP_PATTERN_2.sub("-", name) name = "-".join(name.split()) # replace repeating hyphens - name = re.sub(r"(-)\1+", r"\1", name) + name = CLEANUP_PATTERN_3.sub(r"\1", name) return name[:140] @@ -287,8 +294,8 @@ def extract_title(source, path): if not title and "

" in source: # extract title from h1 - match = re.findall("

([^<]*)", source) - title_content = match[0].strip()[:300] + match = H1_TAG_PATTERN.search(source).group() + title_content = match.strip()[:300] if "{{" not in title_content: title = title_content @@ -308,17 +315,16 @@ def extract_title(source, path): return title -def extract_comment_tag(source, tag): +def extract_comment_tag(source: str, tag: str): """Extract custom tags in comments from source. :param source: raw template source in HTML :param title: tag to search, example "title" """ - if "".format(tag), source)[0].strip() - else: - return None + if f"", source).group().strip() + return None def get_html_content_based_on_type(doc, fieldname, content_type): @@ -378,7 +384,8 @@ def get_frontmatter(string): "Reference: https://github.com/jonbeebe/frontmatter" frontmatter = "" body = "" - result = re.compile(r"^\s*(?:---|\+\+\+)(.*?)(?:---|\+\+\+)\s*(.+)$", re.S | re.M).search(string) + result = FRONTMATTER_PATTERN.search(string) + if result: frontmatter = result.group(1) body = result.group(2) diff --git a/frappe/www/app.py b/frappe/www/app.py index ae0dad3326..f1b62a0899 100644 --- a/frappe/www/app.py +++ b/frappe/www/app.py @@ -10,6 +10,9 @@ import frappe.sessions from frappe import _ from frappe.utils.jinja_globals import is_rtl +SCRIPT_TAG_PATTERN = re.compile(r"\") +CLOSING_SCRIPT_TAG_PATTERN = re.compile(r"") + def get_context(context): if frappe.session.user == "Guest": @@ -34,10 +37,10 @@ def get_context(context): boot_json = frappe.as_json(boot) # remove script tags from boot - boot_json = re.sub(r"\", "", boot_json) + boot_json = SCRIPT_TAG_PATTERN.sub("", boot_json) # TODO: Find better fix - boot_json = re.sub(r"", "", boot_json) + boot_json = CLOSING_SCRIPT_TAG_PATTERN.sub("", boot_json) context.update( {