diff --git a/cypress/integration/recorder.js b/cypress/integration/recorder.js deleted file mode 100644 index de95a852fc..0000000000 --- a/cypress/integration/recorder.js +++ /dev/null @@ -1,72 +0,0 @@ -context.skip("Recorder", () => { - before(() => { - cy.login(); - }); - - beforeEach(() => { - cy.visit("/app/recorder"); - return cy - .window() - .its("frappe") - .then((frappe) => { - // reset recorder - return frappe.xcall("frappe.recorder.stop").then(() => { - return frappe.xcall("frappe.recorder.delete"); - }); - }); - }); - - it("Recorder Empty State", () => { - cy.get(".page-head").findByTitle("Recorder").should("exist"); - - cy.get(".indicator-pill").should("contain", "Inactive").should("have.class", "red"); - - cy.get(".page-actions").findByRole("button", { name: "Start" }).should("exist"); - cy.get(".page-actions").findByRole("button", { name: "Clear" }).should("exist"); - - cy.get(".msg-box").should("contain", "Recorder is Inactive"); - cy.get(".msg-box").findByRole("button", { name: "Start Recording" }).should("exist"); - }); - - it("Recorder Start", () => { - cy.get(".page-actions").findByRole("button", { name: "Start" }).click(); - cy.get(".indicator-pill").should("contain", "Active").should("have.class", "green"); - - cy.get(".msg-box").should("contain", "No Requests found"); - - cy.visit("/app/List/DocType/List"); - cy.intercept("POST", "/api/method/frappe.desk.reportview.get").as("list_refresh"); - cy.wait("@list_refresh"); - - cy.get(".page-head").findByTitle("DocType").should("exist"); - cy.get(".list-count").should("contain", "20 of "); - - cy.visit("/app/recorder"); - cy.get(".page-head").findByTitle("Recorder").should("exist"); - cy.get(".frappe-list .result-list").should( - "contain", - "/api/method/frappe.desk.reportview.get" - ); - }); - - it("Recorder View Request", () => { - cy.get(".page-actions").findByRole("button", { name: "Start" }).click(); - - cy.visit("/app/List/DocType/List"); - cy.intercept("POST", "/api/method/frappe.desk.reportview.get").as("list_refresh"); - cy.wait("@list_refresh"); - - cy.get(".page-head").findByTitle("DocType").should("exist"); - cy.get(".list-count").should("contain", "20 of "); - - cy.visit("/app/recorder"); - - cy.get(".frappe-list .list-row-container span") - .contains("/api/method/frappe") - .should("be.visible") - .click({ force: true }); - - cy.url().should("include", "/recorder/request"); - cy.get("form").should("contain", "/api/method/frappe"); - }); -}); diff --git a/frappe/core/doctype/recorder/db_optimizer.py b/frappe/core/doctype/recorder/db_optimizer.py new file mode 100644 index 0000000000..28b6beca5d --- /dev/null +++ b/frappe/core/doctype/recorder/db_optimizer.py @@ -0,0 +1,283 @@ +"""Basic DB optimizer for Frappe Framework based app. + +This is largely based on heuristics and known good practices for indexing. +""" + +from collections import defaultdict +from dataclasses import dataclass +from typing import TypeVar + +from sql_metadata import Parser + +import frappe +from frappe.utils import flt + +# Any index that reads more than 30% table on average is not "useful" +INDEX_SCORE_THRESHOLD = 0.3 +# Anything reading less than this percent of table is considered optimal +OPTIMIZATION_THRESHOLD = 0.1 + +T = TypeVar("T") + + +@dataclass +class DBColumn: + name: str + cardinality: int | None + is_nullable: bool + default: str + data_type: str + + @classmethod + def from_frappe_ouput(cls, data) -> "DBColumn": + "Parse DBColumn from output of describe-database-table command in Frappe" + return cls( + name=data["column"], + cardinality=data.get("cardinality"), + is_nullable=data["is_nullable"], + default=data["default"], + data_type=data["type"], + ) + + +@dataclass +class DBIndex: + name: str + column: str + table: str + unique: bool | None = None + cardinality: int | None = None + sequence: int = 1 + nullable: bool = True + _score: float = 0.0 + + def __eq__(self, other: "DBIndex") -> bool: + return self.column == other.column and self.sequence == other.sequence and self.table == other.table + + def __repr__(self): + return f"DBIndex(`{self.table}`.`{self.column}`)" + + @classmethod + def from_frappe_ouput(cls, data, table) -> "DBIndex": + "Parse DBIndex from output of describe-database-table command in Frappe" + return cls( + name=data["name"], + table=table, + unique=data["unique"], + cardinality=data["cardinality"], + sequence=data["sequence"], + nullable=data["nullable"], + column=data["column"], + ) + + +@dataclass +class ColumnStat: + column_name: str + avg_frequency: float + avg_length: float + nulls_ratio: float | None = None + histogram: list[float] = None + + def __post_init__(self): + if not self.histogram: + self.histogram = [] + + @classmethod + def from_frappe_ouput(cls, data) -> "ColumnStat": + return cls( + column_name=data["column_name"], + avg_frequency=data["avg_frequency"], + avg_length=data["avg_length"], + nulls_ratio=data["nulls_ratio"], + histogram=[flt(bin) for bin in data["histogram"].split(",")] if data["histogram"] else [], + ) + + +@dataclass +class DBTable: + name: str + total_rows: int + schema: list[DBColumn] | None = None + indexes: list[DBIndex] | None = None + + def __post_init__(self): + if not self.schema: + self.schema = [] + if not self.indexes: + self.indexes = [] + + def update_cardinality(self, column_stats: list[ColumnStat]) -> None: + """Estimate cardinality using mysql.column_stat""" + for column_stat in column_stats: + for col in self.schema: + if col.name == column_stat.column_name and not col.cardinality and column_stat.avg_frequency: + # "hack" or "math" - average frequency is on average how frequently a row value appears. + # Avg = total_rows / cardinality, so... + col.cardinality = self.total_rows / column_stat.avg_frequency + + @classmethod + def from_frappe_ouput(cls, data) -> "DBTable": + "Parse DBTable from output of describe-database-table command in Frappe" + table_name = data["table_name"] + return cls( + name=table_name, + total_rows=data["total_rows"], + schema=[DBColumn.from_frappe_ouput(c) for c in data["schema"]], + indexes=[DBIndex.from_frappe_ouput(i, table_name) for i in data["indexes"]], + ) + + def has_column(self, column: str) -> bool: + for col in self.schema: + if col.name == column: + return True + return False + + +@dataclass +class DBOptimizer: + query: str # raw query in string format + tables: dict[str, DBTable] = None + parsed_query: Parser = None + + def __post_init__(self): + if not self.tables: + self.tables = {} + self.parsed_query = Parser(self.query) + + def tables_examined(self) -> list[str]: + return self.parsed_query.tables + + def update_table_data(self, table: DBTable): + self.tables[table.name] = table + + def _convert_to_db_index(self, column: str) -> DBIndex: + column_name, table = None, None + + if "." in column: + table, column_name = column.split(".") + else: + column_name = column + for table_name, db_table in self.tables.items(): + if db_table.has_column(column): + table = table_name + break + return DBIndex(column=column_name, name=column_name, table=table) + + def _remove_existing_indexes(self, potential_indexes: list[DBIndex]) -> list[DBIndex]: + """Given list of potential index candidates remove the ones that already exist. + + This also removes multi-column indexes for parts that are applicable to query. + Example: If multi-col index A+B+C exists and query utilizes A+B then + A+B are removed from potential indexes. + """ + + def remove_maximum_indexes(idx: list[DBIndex]): + """Try to remove entire index from potential indexes, if not possible, reduce one part and try again until no parts are left.""" + if not idx: + return + matched_sub_index = [] + for idx_part in list(idx): + matching_part = [ + i for i in potential_indexes if i.column == idx_part.column and i.table == idx_part.table + ] + if not matching_part: + # pop and recurse + idx.pop() + return remove_maximum_indexes(idx) + else: + matched_sub_index.extend(matching_part) + + # Every part matched now, lets remove those parts + for i in matched_sub_index: + potential_indexes.remove(i) + + # Reconstruct multi-col index + for table in self.tables.values(): + merged_indexes = defaultdict(list) + for index in table.indexes: + merged_indexes[index.name].append(index) + + for idx in merged_indexes.values(): + idx.sort(key=lambda x: x.sequence) + + for idx in merged_indexes.values(): + remove_maximum_indexes(idx) + return potential_indexes + + def potential_indexes(self) -> list[DBIndex]: + """Get all columns that can potentially be indexed to speed up this query.""" + + possible_indexes = [] + + # Where claus columns using these operators benefit from index + # 1. = (equality) + # 2. >, <, >=, <= + # 3. LIKE 'xyz%' (Prefix search) + # 4. BETWEEN (for date[time] fields) + # 5. IN (similar to equality) + + if not self.parsed_query.columns_dict: + return [] + + if where_columns := self.parsed_query.columns_dict.get("where"): + # TODO: Apply some heuristics here, not all columns in where clause are actually useful + possible_indexes.extend(where_columns) + + # Join clauses - Both sides of join should ideally be indexed. One will *usually* be primary key. + if join_columns := self.parsed_query.columns_dict.get("join"): + possible_indexes.extend(join_columns) + + # Top N query variant - Order by column can possibly speed up the query + if order_by_columns := self.parsed_query.columns_dict.get("order_by"): + if self.parsed_query.limit_and_offset: + possible_indexes.extend(order_by_columns) + + possible_db_indexes = [self._convert_to_db_index(i) for i in possible_indexes] + possible_db_indexes = [i for i in possible_db_indexes if i.column not in ("*", "name")] + possible_db_indexes.sort(key=lambda i: (i.table, i.column)) + + return self._remove_existing_indexes(possible_db_indexes) + + def suggest_index(self) -> DBIndex | None: + """Suggest best possible column to index given query and table stats.""" + if missing_tables := (set(self.tables_examined()) - set(self.tables.keys())): + frappe.throw("DBTable infomation missing for: " + ", ".join(missing_tables)) + + potential_indexes = self.potential_indexes() + + for index in list(potential_indexes): + table = self.tables[index.table] + + # Data type is not easily indexable - skip + column = next(c for c in table.schema if c.name == index.column) + if "text" in column.data_type.lower() or "json" in column.data_type.lower(): + potential_indexes.remove(index) + # Update cardinality from column so scoring can be done + index.cardinality = column.cardinality + + for index in potential_indexes: + index._score = self.index_score(index) + + potential_indexes.sort(key=lambda i: i._score) + if ( + potential_indexes + and (best_index := potential_indexes[0]) + and best_index._score < INDEX_SCORE_THRESHOLD + ): + return best_index + + def index_score(self, index: DBIndex) -> float: + """Score an index from 0 to 1 based on usefulness. + + A score of 0.5 indicates on average this index will read 50% of the table. (e.g. checkboxes)""" + table = self.tables[index.table] + + cardinality = index.cardinality or 2 + total_rows = table.total_rows or cardinality or 1 + + # We assume most unique values are evenly distributed, this is + # definitely not the case IRL but it should be good enough assumptions + # Score is rouhgly what percentage of table we will end up reading on typical query + rows_fetched_on_average = (table.total_rows or cardinality) / cardinality + return rows_fetched_on_average / total_rows diff --git a/frappe/core/doctype/recorder/recorder.js b/frappe/core/doctype/recorder/recorder.js index 2335424a76..e58f55c442 100644 --- a/frappe/core/doctype/recorder/recorder.js +++ b/frappe/core/doctype/recorder/recorder.js @@ -12,6 +12,16 @@ frappe.ui.form.on("Recorder", { frm.fields_dict.sql_queries.grid.grid_pagination.page_length = 500; refresh_field("sql_queries"); frm.trigger("format_grid"); + frm.add_custom_button(__("Suggest Optimizations"), () => { + frappe.xcall("frappe.core.doctype.recorder.recorder.optimize", { + recorder_id: frm.doc.name, + }); + }); + + frappe.realtime.on("recorder-analysis-complete", () => { + frm.reload_doc(); + setTimeout(() => frm.scroll_to_field("suggested_indexes"), 1500); + }); }, setup_sort: function (frm) { @@ -25,6 +35,7 @@ frappe.ui.form.on("Recorder", { frm._sort_order[field] = -1 * sort_order; // reverse for next click grid.refresh(); frm.trigger("setup_sort"); // grid creates new elements again, resetup listeners. + frm.trigger("format_grid"); }); }); }, diff --git a/frappe/core/doctype/recorder/recorder.json b/frappe/core/doctype/recorder/recorder.json index 72291dbfe2..efb6c1d065 100644 --- a/frappe/core/doctype/recorder/recorder.json +++ b/frappe/core/doctype/recorder/recorder.json @@ -20,6 +20,7 @@ "section_break_sgro", "form_dict", "section_break_9jhm", + "suggested_indexes", "sql_queries", "section_break_optn", "profile" @@ -119,6 +120,13 @@ "fieldtype": "Code", "label": "cProfile Output", "read_only": 1 + }, + { + "description": "Disclaimer: These indexes are suggested based on data and queries performed during this recording. These suggestions may or may not help.", + "fieldname": "suggested_indexes", + "fieldtype": "Table", + "label": "Suggested Indexes", + "options": "Recorder Suggested Index" } ], "hide_toolbar": 1, @@ -126,7 +134,7 @@ "index_web_pages_for_search": 1, "is_virtual": 1, "links": [], - "modified": "2024-02-01 22:13:26.505174", + "modified": "2024-05-14 15:16:55.626656", "modified_by": "Administrator", "module": "Core", "name": "Recorder", diff --git a/frappe/core/doctype/recorder/recorder.py b/frappe/core/doctype/recorder/recorder.py index 317bd9c148..cc89436578 100644 --- a/frappe/core/doctype/recorder/recorder.py +++ b/frappe/core/doctype/recorder/recorder.py @@ -1,10 +1,16 @@ # Copyright (c) 2023, Frappe Technologies and contributors # For license information, please see license.txt +from collections import Counter, defaultdict + import frappe +from frappe import _ +from frappe.core.doctype.recorder.db_optimizer import DBOptimizer, DBTable from frappe.model.document import Document +from frappe.recorder import RECORDER_REQUEST_HASH from frappe.recorder import get as get_recorder_data -from frappe.utils import cint, evaluate_filters +from frappe.utils import cint, cstr, evaluate_filters, get_table_name +from frappe.utils.caching import redis_cache class Recorder(Document): @@ -15,6 +21,9 @@ class Recorder(Document): if TYPE_CHECKING: from frappe.core.doctype.recorder_query.recorder_query import RecorderQuery + from frappe.core.doctype.recorder_suggested_index.recorder_suggested_index import ( + RecorderSuggestedIndex, + ) from frappe.types import DF cmd: DF.Data | None @@ -27,6 +36,7 @@ class Recorder(Document): profile: DF.Code | None request_headers: DF.Code | None sql_queries: DF.Table[RecorderQuery] + suggested_indexes: DF.Table[RecorderSuggestedIndex] time: DF.Datetime | None time_in_queries: DF.Float # end: auto-generated types @@ -95,8 +105,160 @@ def serialize_request(request): request_headers=frappe.as_json(request.get("headers"), indent=4), form_dict=frappe.as_json(request.get("form_dict"), indent=4), sql_queries=request.get("calls"), + suggested_indexes=request.get("suggested_indexes"), modified=request.get("time"), creation=request.get("time"), ) return request + + +@frappe.whitelist() +def optimize(recorder_id: str): + frappe.only_for("Administrator") + frappe.enqueue(_optimize, recorder_id=recorder_id, queue="long") + + +def _optimize(recorder_id): + record: Recorder = frappe.get_doc("Recorder", recorder_id) + total_duration = record.time_in_queries + + # Any index with query time less than 5% of total time is not suggested + PERCENT_DURATION_THRESHOLD_OVERALL = 0.05 + # Any query with duration less than 0.5% of total duration is not analyzed + PERCENT_DURATION_THRESHOLD_QUERY = 0.005 + + # Index suggestion -> Query duration + index_suggestions = Counter() + for idx, captured_query in enumerate(record.sql_queries, start=1): + query = cstr(captured_query.query) + frappe.publish_progress( + idx / len(record.sql_queries) * 100, + title="Analyzing Queries", + doctype=record.doctype, + docname=record.name, + description=f"Analyzing query: {query[:140]}", + ) + if captured_query.duration < total_duration * PERCENT_DURATION_THRESHOLD_QUERY: + continue + if not query.lower().strip().startswith(("select", "update", "delete")): + continue + if index := _optimize_query(query): + index_suggestions[(index.table, index.column)] += captured_query.duration + + suggested_indexes = index_suggestions.most_common(3) + suggested_indexes = [ + idx for idx in suggested_indexes if idx[1] > total_duration * PERCENT_DURATION_THRESHOLD_OVERALL + ] + + if not suggested_indexes: + frappe.msgprint(_("No optimization suggestions."), realtime=True) + return + + frappe.msgprint(_("Query analysis complete. Check suggested indexes."), realtime=True, alert=True) + data = frappe.cache.hget(RECORDER_REQUEST_HASH, record.name) + data["suggested_indexes"] = [{"table": idx[0][0], "column": idx[0][1]} for idx in suggested_indexes] + frappe.cache.hset(RECORDER_REQUEST_HASH, record.name, data) + frappe.publish_realtime("recorder-analysis-complete") + + +def _optimize_query(query): + optimizer = DBOptimizer(query=query) + tables = optimizer.tables_examined() + + # Note: Two passes are required here because we first need basic data to understand which + # columns need to be analyzed to get accurate cardinality. + for table in tables: + doctype = get_doctype_name(table) + stats = _fetch_table_stats(doctype, columns=[]) + if not stats: + return + db_table = DBTable.from_frappe_ouput(stats) + optimizer.update_table_data(db_table) + + potential_indexes = optimizer.potential_indexes() + tablewise_columns = defaultdict(list) + for idx in potential_indexes: + tablewise_columns[idx.table].append(idx.column) + + for table in tables: + doctype = get_doctype_name(table) + stats = _fetch_table_stats(doctype, columns=tablewise_columns[table]) + if not stats: + return + db_table = DBTable.from_frappe_ouput(stats) + optimizer.update_table_data(db_table) + + return optimizer.suggest_index() + + +def _fetch_table_stats(doctype: str, columns: list[str]) -> dict | None: + def sql_bool(val): + return cstr(val).lower() in ("yes", "1", "true") + + if not frappe.db.table_exists(doctype): + return + + table = get_table_name(doctype, wrap_in_backticks=True) + + schema = [] + for field in frappe.db.sql(f"describe {table}", as_dict=True): + schema.append( + { + "column": field["Field"], + "type": field["Type"], + "is_nullable": sql_bool(field["Null"]), + "default": field["Default"], + } + ) + + def update_cardinality(column, value): + for col in schema: + if col["column"] == column: + col["cardinality"] = value + break + + indexes = [] + for idx in frappe.db.sql(f"show index from {table}", as_dict=True): + indexes.append( + { + "unique": not sql_bool(idx["Non_unique"]), + "cardinality": idx["Cardinality"], + "name": idx["Key_name"], + "sequence": idx["Seq_in_index"], + "nullable": sql_bool(idx["Null"]), + "column": idx["Column_name"], + "type": idx["Index_type"], + } + ) + if idx["Seq_in_index"] == 1: + update_cardinality(idx["Column_name"], idx["Cardinality"]) + + total_rows = cint( + frappe.db.sql( + f"""select table_rows + from information_schema.tables + where table_name = 'tab{doctype}'""" + )[0][0] + ) + + # fetch accurate cardinality for columns by query. WARN: This can take A LOT of time. + for column in columns: + cardinality = _get_column_cardinality(table, column) + update_cardinality(column, cardinality) + + return { + "table_name": table.strip("`"), + "total_rows": total_rows, + "schema": schema, + "indexes": indexes, + } + + +@redis_cache +def _get_column_cardinality(table, column): + return frappe.db.sql(f"select count(distinct {column}) from {table}")[0][0] + + +def get_doctype_name(table_name: str) -> str: + return table_name.removeprefix("tab") diff --git a/frappe/core/doctype/recorder_suggested_index/__init__.py b/frappe/core/doctype/recorder_suggested_index/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/frappe/core/doctype/recorder_suggested_index/recorder_suggested_index.json b/frappe/core/doctype/recorder_suggested_index/recorder_suggested_index.json new file mode 100644 index 0000000000..1a265f347b --- /dev/null +++ b/frappe/core/doctype/recorder_suggested_index/recorder_suggested_index.json @@ -0,0 +1,46 @@ +{ + "actions": [], + "allow_rename": 1, + "creation": "2024-05-14 15:03:46.138438", + "doctype": "DocType", + "editable_grid": 1, + "engine": "InnoDB", + "field_order": [ + "table", + "column", + "add_index" + ], + "fields": [ + { + "fieldname": "table", + "fieldtype": "Data", + "in_list_view": 1, + "label": "Table" + }, + { + "fieldname": "column", + "fieldtype": "Data", + "in_list_view": 1, + "label": "Column" + }, + { + "columns": 2, + "fieldname": "add_index", + "fieldtype": "Button", + "label": "Add Index" + } + ], + "index_web_pages_for_search": 1, + "is_virtual": 1, + "istable": 1, + "links": [], + "modified": "2024-05-14 15:18:51.371808", + "modified_by": "Administrator", + "module": "Core", + "name": "Recorder Suggested Index", + "owner": "Administrator", + "permissions": [], + "sort_field": "creation", + "sort_order": "DESC", + "states": [] +} \ No newline at end of file diff --git a/frappe/core/doctype/recorder_suggested_index/recorder_suggested_index.py b/frappe/core/doctype/recorder_suggested_index/recorder_suggested_index.py new file mode 100644 index 0000000000..50e634174c --- /dev/null +++ b/frappe/core/doctype/recorder_suggested_index/recorder_suggested_index.py @@ -0,0 +1,46 @@ +# Copyright (c) 2024, Frappe Technologies and contributors +# For license information, please see license.txt + +# import frappe +from frappe.model.document import Document + + +class RecorderSuggestedIndex(Document): + # begin: auto-generated types + # This code is auto-generated. Do not modify anything in this block. + + from typing import TYPE_CHECKING + + if TYPE_CHECKING: + from frappe.types import DF + + column: DF.Data | None + parent: DF.Data + parentfield: DF.Data + parenttype: DF.Data + table: DF.Data | None + # end: auto-generated types + + def db_insert(self, *args, **kwargs): + raise NotImplementedError + + def load_from_db(self): + raise NotImplementedError + + def db_update(self): + raise NotImplementedError + + def delete(self): + raise NotImplementedError + + @staticmethod + def get_list(filters=None, page_length=20, **kwargs): + pass + + @staticmethod + def get_count(filters=None, **kwargs): + pass + + @staticmethod + def get_stats(**kwargs): + pass