perf: "random" naming to improve concurrency and locality (#30053)
This feels overengineered and it kinda is, but other efforts to introduce sequential naming/UUID naming haven't been that fruitful either. The 10-character random "hash" is now changed to: 1. first character - last character of the UUID4 ID of the request/job. 2. three characters - derived from the current timestamp. 3. six characters - random data. This satisfies all three requirements: 1. Readers - temporal locality should result in spatial locality on disk. (fewer pages accessed) 2. Single writer - temporal locality should result in spatial locality. (fewer dirty pages) 3. Multiple writers - temporal locality should NOT result in spatial locality. (less lock contention) Mostly concludes https://github.com/frappe/frappe/pull/25309 and https://github.com/frappe/frappe/pull/28349 Rough probability numbers. Assumptions: - Unique per-worker prefixes - 16 (UUID's base16 version) - Rough time spent generating names - 10% of request (very, very conservative estimate) Probability(collision) = P(at least one prefix collision) * P(time collision) Probability(collision) = (1 - P(all different)) * 10% Probability(collision) = (1 - (16! / (16-N)!) / 16^N) * 10% | N (concurrency) | Probability(collision) | | 1 | 0.0% | | 2 | 0.6% | | 3 | 1.8% | | 4 | 3.3% | | 5 | 5.0% | | 6 | 6.6% | | 7 | 7.9% | | 8 | 8.8% |
This commit is contained in:
parent
473ff81a31
commit
9b79dfeb7b
3 changed files with 11 additions and 5 deletions
|
|
@ -14,6 +14,7 @@ import uuid_utils
|
|||
import frappe
|
||||
from frappe import _
|
||||
from frappe.model import log_types
|
||||
from frappe.monitor import get_trace_id
|
||||
from frappe.query_builder import DocType
|
||||
from frappe.utils import cint, cstr, now_datetime
|
||||
|
||||
|
|
@ -281,7 +282,7 @@ def make_autoname(key="", doctype="", doc="", *, ignore_validate=False):
|
|||
DE/09/01/00001 where 09 is the year, 01 is the month and 00001 is the series
|
||||
"""
|
||||
if key == "hash":
|
||||
return _generate_random_string(10)
|
||||
return (_get_timestamp_prefix() + _generate_random_string(7))[:10]
|
||||
|
||||
series = NamingSeries(key)
|
||||
return series.generate_next_name(doc, ignore_validate=ignore_validate)
|
||||
|
|
@ -291,7 +292,14 @@ def _get_timestamp_prefix():
|
|||
ts = int(time.time() * 10) # time in deciseconds
|
||||
# we ~~don't need~~ can't get ordering over entire lifetime, so we wrap the time.
|
||||
ts = ts % (32**4)
|
||||
return base64.b32hexencode(ts.to_bytes(length=5, byteorder="big")).decode()[-4:].lower()
|
||||
ts_part = base64.b32hexencode(ts.to_bytes(length=5, byteorder="big")).decode()[-3:].lower()
|
||||
|
||||
# First character is from request/job specific UUID, all documents created in this "session" will
|
||||
# have same prefix. This avoids collision between parallel jobs with reasonable probabilistic
|
||||
# guarantees.
|
||||
request_part = (get_trace_id() or "")[-1:]
|
||||
|
||||
return request_part + ts_part
|
||||
|
||||
|
||||
def _generate_random_string(length=10):
|
||||
|
|
|
|||
|
|
@ -83,7 +83,7 @@ class Monitor:
|
|||
self.data.job.scheduled = True
|
||||
|
||||
if job := rq.get_current_job():
|
||||
self.data.uuid = job.id
|
||||
self.data.job_id = job.id
|
||||
waitdiff = self.data.timestamp - job.enqueued_at.replace(tzinfo=datetime.timezone.utc)
|
||||
self.data.job.wait = int(waitdiff.total_seconds() * 1000000)
|
||||
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@
|
|||
# License: MIT. See LICENSE
|
||||
|
||||
import time
|
||||
import unittest
|
||||
from uuid import UUID
|
||||
|
||||
import uuid_utils
|
||||
|
|
@ -407,7 +406,6 @@ class TestNaming(IntegrationTestCase):
|
|||
expected_name = "TODO-" + nowdate().split("-")[1] + "-" + "0001"
|
||||
self.assertEqual(name, expected_name)
|
||||
|
||||
@unittest.skip("This is not supported anymore, see #28349.")
|
||||
@retry(
|
||||
retry=retry_if_exception_type(AssertionError),
|
||||
stop=stop_after_attempt(3),
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue