diff options
author | Conrad Meyer <cem@FreeBSD.org> | 2019-04-13 04:42:17 +0000 |
---|---|---|
committer | Conrad Meyer <cem@FreeBSD.org> | 2019-04-13 04:42:17 +0000 |
commit | f20b149b45198bd016e669f60758e05c24a332ad (patch) | |
tree | 574cb1668a31c9c17da36af88efefd6dd0313c42 /usr.bin/sort | |
parent | 49d9a5978306d6856325808653eec1ceac21263c (diff) | |
download | src-test-f20b149b45198bd016e669f60758e05c24a332ad.tar.gz src-test-f20b149b45198bd016e669f60758e05c24a332ad.zip |
sort(1): Memoize MD5 computation to reduce repeated computation
Experimentally, reduces sort -R time of a 148160 line corpus from about
3.15s to about 0.93s on this particular system.
There's probably room for improvement using some digest other than md5, but
I don't want to look at sort(1) anymore. Some discussion of other possible
improvements in the Test Plan section of the Differential.
PR: 230792
Reviewed by: jhb (earlier version)
Differential Revision: https://reviews.freebsd.org/D19885
Notes
Notes:
svn path=/head/; revision=346175
Diffstat (limited to 'usr.bin/sort')
-rw-r--r-- | usr.bin/sort/coll.c | 23 | ||||
-rw-r--r-- | usr.bin/sort/coll.h | 12 | ||||
-rw-r--r-- | usr.bin/sort/sort.c | 1 |
3 files changed, 36 insertions, 0 deletions
diff --git a/usr.bin/sort/coll.c b/usr.bin/sort/coll.c index 979ba0d2c0ea7..ccd73e19d393a 100644 --- a/usr.bin/sort/coll.c +++ b/usr.bin/sort/coll.c @@ -981,6 +981,15 @@ hnumcoll(struct key_value *kv1, struct key_value *kv2, size_t offset) return (numcoll_impl(kv1, kv2, offset, true)); } +/* Use hint space to memoize md5 computations, at least. */ +static void +randomcoll_init_hint(struct key_value *kv, void *hash) +{ + + memcpy(kv->hint->v.Rh.cached, hash, sizeof(kv->hint->v.Rh.cached)); + kv->hint->status = HS_INITIALIZED; +} + /* * Implements random sort (-R). */ @@ -991,6 +1000,7 @@ randomcoll(struct key_value *kv1, struct key_value *kv2, struct bwstring *s1, *s2; MD5_CTX ctx1, ctx2; unsigned char hash1[MD5_DIGEST_LENGTH], hash2[MD5_DIGEST_LENGTH]; + int cmp; s1 = kv1->k; s2 = kv2->k; @@ -1003,6 +1013,14 @@ randomcoll(struct key_value *kv1, struct key_value *kv2, if (s1 == s2) return (0); + if (kv1->hint->status == HS_INITIALIZED && + kv2->hint->status == HS_INITIALIZED) { + cmp = memcmp(kv1->hint->v.Rh.cached, + kv2->hint->v.Rh.cached, sizeof(kv1->hint->v.Rh.cached)); + if (cmp != 0) + return (cmp); + } + memcpy(&ctx1, &md5_ctx, sizeof(MD5_CTX)); memcpy(&ctx2, &md5_ctx, sizeof(MD5_CTX)); @@ -1012,6 +1030,11 @@ randomcoll(struct key_value *kv1, struct key_value *kv2, MD5Final(hash1, &ctx1); MD5Final(hash2, &ctx2); + if (kv1->hint->status == HS_UNINITIALIZED) + randomcoll_init_hint(kv1, hash1); + if (kv2->hint->status == HS_UNINITIALIZED) + randomcoll_init_hint(kv2, hash2); + return (memcmp(hash1, hash2, sizeof(hash1))); } diff --git a/usr.bin/sort/coll.h b/usr.bin/sort/coll.h index 10505a423bebc..e89c9f8233101 100644 --- a/usr.bin/sort/coll.h +++ b/usr.bin/sort/coll.h @@ -65,6 +65,17 @@ struct M_hint }; /* + * Sort hint data for -R + * + * This stores the first 12 bytes of the digest rather than the full output to + * avoid increasing the size of the 'key_hint' object via the 'v' union. + */ +struct R_hint +{ + unsigned char cached[12]; +}; + +/* * Status of a sort hint object */ typedef enum @@ -83,6 +94,7 @@ struct key_hint struct n_hint nh; struct g_hint gh; struct M_hint Mh; + struct R_hint Rh; } v; }; diff --git a/usr.bin/sort/sort.c b/usr.bin/sort/sort.c index 844841c50458b..fee6f72449e25 100644 --- a/usr.bin/sort/sort.c +++ b/usr.bin/sort/sort.c @@ -583,6 +583,7 @@ set_sort_modifier(struct sort_mods *sm, int c) break; case 'R': sm->Rflag = true; + need_hint = true; need_random = true; break; case 'M': |