perf: Improve import template performance by 10 times

Removed the nested for loop to improve performance, and switched to generators
to reduce memory usage.
This commit is contained in:
leela 2021-03-11 09:42:54 +05:30
parent 7eee5c1a35
commit a62cc40885
2 changed files with 39 additions and 14 deletions

View file

@ -2,13 +2,15 @@
# Copyright (c) 2019, Frappe Technologies Pvt. Ltd. and Contributors
# MIT License. See license.txt
import typing
import frappe
from frappe.model import (
display_fieldtypes,
no_value_fields,
table_fields as table_fieldtypes,
)
from frappe.utils import flt, format_duration
from frappe.utils import flt, format_duration, groupby_metric
from frappe.utils.csvutils import build_csv_response
from frappe.utils.xlsxutils import build_xlsx_response
@ -116,7 +118,6 @@ class Exporter:
def get_data_to_export(self):
frappe.permissions.can_export(self.doctype, raise_exception=True)
data_to_export = []
table_fields = [f for f in self.exportable_fields if f != self.doctype]
data = self.get_data_as_docs()
@ -128,14 +129,13 @@ class Exporter:
if table_fields:
# add child table data
for f in table_fields:
for i, child_row in enumerate(doc[f]):
for i, child_row in enumerate(doc.get(f, [])):
table_df = self.meta.get_field(f)
child_doctype = table_df.options
rows = self.add_data_row(child_doctype, child_row.parentfield, child_row, rows, i)
data_to_export += rows
return data_to_export
for row in rows:
yield row
def add_data_row(self, doctype, parentfield, doc, rows, row_idx):
if len(rows) < row_idx + 1:
@ -204,17 +204,13 @@ class Exporter:
)
child_data[key] = data
return self.merge_data(parent_data, child_data)
def merge_data(self, parent_data, child_data):
# Attach child-table rows to the parent docs they belong to.
# NOTE(review): this hunk is shown without +/- markers, so the replaced
# nested-loop lines and their generator-based replacement appear interleaved
# below -- confirm which lines are live against the raw diff.
# Group children data by parent name
grouped_children_data = self.group_children_data_by_parent(child_data)
for doc in parent_data:
# Presumably the pre-change body: for every parent doc, every table's rows
# are re-filtered by `row.parent == doc.name`, then the input is returned.
for table_field, table_rows in child_data.items():
doc[table_field] = [row for row in table_rows if row.parent == doc.name]
return parent_data
# Presumably the post-change body: one lookup per parent in the pre-grouped
# mapping (empty dict when nothing is grouped under doc.name), yielding a new
# merged dict per doc.
related_children_docs = grouped_children_data.get(doc.name, {})
yield {**doc, **related_children_docs}
def add_header(self):
header = []
for df in self.fields:
is_parent = not df.is_child_table_field
@ -261,3 +257,6 @@ class Exporter:
def build_xlsx_response(self):
    """Hand the export rows and this exporter's doctype to the module-level
    ``build_xlsx_response`` helper.

    The rows come from ``self.get_csv_array_for_export()``.
    """
    export_rows = self.get_csv_array_for_export()
    build_xlsx_response(export_rows, self.doctype)
def group_children_data_by_parent(self, children_data: typing.Dict[str, list]):
    """Regroup ``children_data`` by each row's ``parent`` value.

    Delegates to ``groupby_metric`` with ``key='parent'`` and returns its
    result unchanged.
    """
    grouped_by_parent = groupby_metric(children_data, key='parent')
    return grouped_by_parent

View file

@ -11,6 +11,7 @@ import os
import re
import sys
import traceback
import typing
from email.header import decode_header, make_header
from email.utils import formataddr, parseaddr
@ -763,3 +764,28 @@ def get_bench_relative_path(file_path):
sys.exit(1)
return os.path.abspath(file_path)
def groupby_metric(iterable: typing.Dict[str, list], key: str):
    """Group records by a metric.

    Use case: given a country-wise list of players where every player has a
    ranking (several players in a country may share the same ranking), group
    the players by ranking -- or by any other field -- while keeping them
    split per country.

    >>> d = {
    ...     'india': [{'name': 'iplayer-1', 'ranking': 1}, {'name': 'iplayer-2', 'ranking': 2}],
    ...     'aus': [{'name': 'aplayer-1', 'ranking': 1}],
    ... }
    >>> grouped = groupby_metric(d, key='ranking')
    >>> grouped[1]
    {'india': [{'name': 'iplayer-1', 'ranking': 1}], 'aus': [{'name': 'aplayer-1', 'ranking': 1}]}
    >>> grouped[2]
    {'india': [{'name': 'iplayer-2', 'ranking': 2}]}

    :param iterable: Mapping of category name to a list of records; every
        record must support ``record[key]``.
    :param key: Name of the field whose value the records are grouped by.
    :return: Dict mapping each distinct metric value to a
        ``{category: [records, ...]}`` dict. Categories and records keep the
        order in which they were encountered; a category appears under a
        metric value only if it has at least one record with that value.
    :raises KeyError: If a dict record does not contain ``key``.
    """
    records = {}
    for category, items in iterable.items():
        for item in items:
            # First level: metric value; second level: originating category.
            by_category = records.setdefault(item[key], {})
            by_category.setdefault(category, []).append(item)
    return records