summaryrefslogtreecommitdiff
path: root/usr.bin/sort
diff options
context:
space:
mode:
authorConrad Meyer <cem@FreeBSD.org>2019-04-13 04:42:17 +0000
committerConrad Meyer <cem@FreeBSD.org>2019-04-13 04:42:17 +0000
commitf20b149b45198bd016e669f60758e05c24a332ad (patch)
tree574cb1668a31c9c17da36af88efefd6dd0313c42 /usr.bin/sort
parent49d9a5978306d6856325808653eec1ceac21263c (diff)
downloadsrc-test-f20b149b45198bd016e669f60758e05c24a332ad.tar.gz
src-test-f20b149b45198bd016e669f60758e05c24a332ad.zip
sort(1): Memoize MD5 computation to reduce repeated computation
Experimentally, reduces sort -R time of a 148160 line corpus from about 3.15s to about 0.93s on this particular system. There's probably room for improvement using some digest other than md5, but I don't want to look at sort(1) anymore. Some discussion of other possible improvements in the Test Plan section of the Differential. PR: 230792 Reviewed by: jhb (earlier version) Differential Revision: https://reviews.freebsd.org/D19885
Notes
Notes: svn path=/head/; revision=346175
Diffstat (limited to 'usr.bin/sort')
-rw-r--r--usr.bin/sort/coll.c23
-rw-r--r--usr.bin/sort/coll.h12
-rw-r--r--usr.bin/sort/sort.c1
3 files changed, 36 insertions, 0 deletions
diff --git a/usr.bin/sort/coll.c b/usr.bin/sort/coll.c
index 979ba0d2c0ea7..ccd73e19d393a 100644
--- a/usr.bin/sort/coll.c
+++ b/usr.bin/sort/coll.c
@@ -981,6 +981,15 @@ hnumcoll(struct key_value *kv1, struct key_value *kv2, size_t offset)
return (numcoll_impl(kv1, kv2, offset, true));
}
+/* Use hint space to memoize md5 computations, at least. */
+static void
+randomcoll_init_hint(struct key_value *kv, void *hash)
+{
+
+ memcpy(kv->hint->v.Rh.cached, hash, sizeof(kv->hint->v.Rh.cached));
+ kv->hint->status = HS_INITIALIZED;
+}
+
/*
* Implements random sort (-R).
*/
@@ -991,6 +1000,7 @@ randomcoll(struct key_value *kv1, struct key_value *kv2,
struct bwstring *s1, *s2;
MD5_CTX ctx1, ctx2;
unsigned char hash1[MD5_DIGEST_LENGTH], hash2[MD5_DIGEST_LENGTH];
+ int cmp;
s1 = kv1->k;
s2 = kv2->k;
@@ -1003,6 +1013,14 @@ randomcoll(struct key_value *kv1, struct key_value *kv2,
if (s1 == s2)
return (0);
+ if (kv1->hint->status == HS_INITIALIZED &&
+ kv2->hint->status == HS_INITIALIZED) {
+ cmp = memcmp(kv1->hint->v.Rh.cached,
+ kv2->hint->v.Rh.cached, sizeof(kv1->hint->v.Rh.cached));
+ if (cmp != 0)
+ return (cmp);
+ }
+
memcpy(&ctx1, &md5_ctx, sizeof(MD5_CTX));
memcpy(&ctx2, &md5_ctx, sizeof(MD5_CTX));
@@ -1012,6 +1030,11 @@ randomcoll(struct key_value *kv1, struct key_value *kv2,
MD5Final(hash1, &ctx1);
MD5Final(hash2, &ctx2);
+ if (kv1->hint->status == HS_UNINITIALIZED)
+ randomcoll_init_hint(kv1, hash1);
+ if (kv2->hint->status == HS_UNINITIALIZED)
+ randomcoll_init_hint(kv2, hash2);
+
return (memcmp(hash1, hash2, sizeof(hash1)));
}
diff --git a/usr.bin/sort/coll.h b/usr.bin/sort/coll.h
index 10505a423bebc..e89c9f8233101 100644
--- a/usr.bin/sort/coll.h
+++ b/usr.bin/sort/coll.h
@@ -65,6 +65,17 @@ struct M_hint
};
/*
+ * Sort hint data for -R
+ *
+ * This stores the first 12 bytes of the digest rather than the full output to
+ * avoid increasing the size of the 'key_hint' object via the 'v' union.
+ */
+struct R_hint
+{
+ unsigned char cached[12];
+};
+
+/*
* Status of a sort hint object
*/
typedef enum
@@ -83,6 +94,7 @@ struct key_hint
struct n_hint nh;
struct g_hint gh;
struct M_hint Mh;
+ struct R_hint Rh;
} v;
};
diff --git a/usr.bin/sort/sort.c b/usr.bin/sort/sort.c
index 844841c50458b..fee6f72449e25 100644
--- a/usr.bin/sort/sort.c
+++ b/usr.bin/sort/sort.c
@@ -583,6 +583,7 @@ set_sort_modifier(struct sort_mods *sm, int c)
break;
case 'R':
sm->Rflag = true;
+ need_hint = true;
need_random = true;
break;
case 'M':