perf: Improve import template performance by 10 times

Removed the nested for loop to improve performance, and switched to generators to reduce memory usage.
2021-03-11 09:42:54 +05:30 · 2021-03-11 09:42:54 +05:30 · a62cc40885
commit a62cc40885
parent 7eee5c1a35
2 changed files with 39 additions and 14 deletions
--- a/frappe/core/doctype/data_import/exporter.py
+++ b/frappe/core/doctype/data_import/exporter.py
@ -2,13 +2,15 @@
 # Copyright (c) 2019, Frappe Technologies Pvt. Ltd. and Contributors
 # MIT License. See license.txt

+import typing
+
 import frappe
 from frappe.model import (
 	display_fieldtypes,
 	no_value_fields,
 	table_fields as table_fieldtypes,
 )
-from frappe.utils import flt, format_duration
+from frappe.utils import flt, format_duration, groupby_metric
 from frappe.utils.csvutils import build_csv_response
 from frappe.utils.xlsxutils import build_xlsx_response

@ -116,7 +118,6 @@ class Exporter:

 	def get_data_to_export(self):
 		frappe.permissions.can_export(self.doctype, raise_exception=True)
-		data_to_export = []

 		table_fields = [f for f in self.exportable_fields if f != self.doctype]
 		data = self.get_data_as_docs()
@ -128,14 +129,13 @@ class Exporter:
 			if table_fields:
 				# add child table data
 				for f in table_fields:
-					for i, child_row in enumerate(doc[f]):
+					for i, child_row in enumerate(doc.get(f, [])):
 						table_df = self.meta.get_field(f)
 						child_doctype = table_df.options
 						rows = self.add_data_row(child_doctype, child_row.parentfield, child_row, rows, i)

-			data_to_export += rows
-
-		return data_to_export
+			for row in rows:
+				yield row

 	def add_data_row(self, doctype, parentfield, doc, rows, row_idx):
 		if len(rows) < row_idx + 1:
@ -204,17 +204,13 @@ class Exporter:
 			)
 			child_data[key] = data

-		return self.merge_data(parent_data, child_data)
-
-	def merge_data(self, parent_data, child_data):
+		# Group children data by parent name
+		grouped_children_data = self.group_children_data_by_parent(child_data)
 		for doc in parent_data:
-			for table_field, table_rows in child_data.items():
-				doc[table_field] = [row for row in table_rows if row.parent == doc.name]
-
-		return parent_data
+			related_children_docs = grouped_children_data.get(doc.name, {})
+			yield {**doc, **related_children_docs}

 	def add_header(self):
-
 		header = []
 		for df in self.fields:
 			is_parent = not df.is_child_table_field
@ -261,3 +257,6 @@ class Exporter:

 	def build_xlsx_response(self):
 		build_xlsx_response(self.get_csv_array_for_export(), self.doctype)
+
+	def group_children_data_by_parent(self, children_data: typing.Dict[str, list]):
+		return groupby_metric(children_data, key='parent')
--- a/frappe/utils/init.py
+++ b/frappe/utils/init.py
@ -11,6 +11,7 @@ import os
 import re
 import sys
 import traceback
+import typing

 from email.header import decode_header, make_header
 from email.utils import formataddr, parseaddr
@ -763,3 +764,28 @@ def get_bench_relative_path(file_path):
 		sys.exit(1)

 	return os.path.abspath(file_path)
+
+
+def groupby_metric(iterable: typing.Dict[str, list], key: str):
+	""" Group records by a metric.
+
+	Use case: Let's assume we have a country-wise list of players with a ranking given for each player (multiple players in a country can have the same ranking as well).
+	We can group the players by ranking (or by any other metric) using this function.
+
+	>>> d = {
+		'india': [{'id':1, 'name': 'iplayer-1', 'ranking': 1}, {'id': 2, 'ranking': 1, 'name': 'iplayer-2'}, {'id': 2, 'ranking': 2, 'name': 'iplayer-3'}],
+		'Aus': [{'id':1, 'name': 'aplayer-1', 'ranking': 1}, {'id': 2, 'ranking': 1, 'name': 'aplayer-2'}, {'id': 2, 'ranking': 2, 'name': 'aplayer-3'}]
+	}
+	>>> groupby_metric(d, key='ranking')
+	{1: {'Aus': [{'id': 1, 'name': 'aplayer-1', 'ranking': 1},
+				{'id': 2, 'name': 'aplayer-2', 'ranking': 1}],
+		'india': [{'id': 1, 'name': 'iplayer-1', 'ranking': 1},
+				{'id': 2, 'name': 'iplayer-2', 'ranking': 1}]},
+	2: {'Aus': [{'id': 2, 'name': 'aplayer-3', 'ranking': 2}],
+		'india': [{'id': 2, 'name': 'iplayer-3', 'ranking': 2}]}}
+	"""
+	records = {}
+	for category, items in iterable.items():
+		for item in items:
+			records.setdefault(item[key], {}).setdefault(category, []).append(item)
+	return records