summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew Jeffery <andrew@aj.id.au>2015-08-20 15:46:31 +0930
committerRusty Russell <rusty@rustcorp.com.au>2015-09-09 04:31:39 +0930
commit7c7ad8cc84eb75d83222fb269d9c117c12c8e75c (patch)
tree0aacd9aa6abf762496fe29d2bb1b3462f3ce6ba7
parent4ad5144790a12523f8a7c24c469a34907b6942a6 (diff)
strgrp: new module
-rw-r--r--Makefile-ccan1
l---------ccan/strgrp/LICENSE1
-rw-r--r--ccan/strgrp/_info107
-rw-r--r--ccan/strgrp/strgrp.c366
-rw-r--r--ccan/strgrp/strgrp.h186
-rw-r--r--ccan/strgrp/test/api_all_similar_insert_two.c7
-rw-r--r--ccan/strgrp/test/api_almost_different_25_insert_two.c7
-rw-r--r--ccan/strgrp/test/api_almost_different_50_insert_two.c7
-rw-r--r--ccan/strgrp/test/api_almost_different_75_insert_two.c7
-rw-r--r--ccan/strgrp/test/api_almost_similar_25_insert_two.c7
-rw-r--r--ccan/strgrp/test/api_almost_similar_50_insert_two.c7
-rw-r--r--ccan/strgrp/test/api_almost_similar_75_insert_two.c7
-rw-r--r--ccan/strgrp/test/api_exact_match_insert_two.c7
-rw-r--r--ccan/strgrp/test/api_illegal_none_similar_insert_two.c7
-rw-r--r--ccan/strgrp/test/api_insert_duplicate.c7
-rw-r--r--ccan/strgrp/test/api_insert_one.c23
-rw-r--r--ccan/strgrp/test/api_insert_two_distinct.c7
-rw-r--r--ccan/strgrp/test/api_iterate_no_groups.c14
-rw-r--r--ccan/strgrp/test/api_iterate_one_group.c16
-rw-r--r--ccan/strgrp/test/api_iterate_two_groups.c18
-rw-r--r--ccan/strgrp/test/api_new_free.c11
-rw-r--r--ccan/strgrp/test/api_test_free_cb.c16
-rw-r--r--ccan/strgrp/test/api_test_print.c15
-rw-r--r--ccan/strgrp/test/helpers.c67
-rw-r--r--ccan/strgrp/test/helpers.h29
25 files changed, 947 insertions, 0 deletions
diff --git a/Makefile-ccan b/Makefile-ccan
index 8d13a4f5..fb7e4c9b 100644
--- a/Makefile-ccan
+++ b/Makefile-ccan
@@ -102,6 +102,7 @@ MODS_WITH_SRC := aga \
str/hex \
stringbuilder \
stringmap \
+ strgrp \
strmap \
strset \
take \
diff --git a/ccan/strgrp/LICENSE b/ccan/strgrp/LICENSE
new file mode 120000
index 00000000..74550445
--- /dev/null
+++ b/ccan/strgrp/LICENSE
@@ -0,0 +1 @@
+../../licenses/LGPL-3 \ No newline at end of file
diff --git a/ccan/strgrp/_info b/ccan/strgrp/_info
new file mode 100644
index 00000000..fcf2fb26
--- /dev/null
+++ b/ccan/strgrp/_info
@@ -0,0 +1,107 @@
+#include "config.h"
+#include <stdio.h>
+#include <string.h>
+
+/**
+ * strgrp - group/cluster similar strings.
+ *
+ * strgrp uses the Longest Common Subsequence (LCS) algorithm[1] to cluster
+ * similar strings. It is governed by a threshold which is compared against
+ * the computed normalised LCS length for all known groups.
+ *
+ * As a coarse and not entirely accurate summary, strgrp takes the following
+ * steps:
+ *
+ * 1. For all known strings, calculate the normalised LCS value against the
+ * input string
+ * 2. Find the maximum normalised LCS value and associated group
+ * 3. If the calculated normalised LCS value exceeds the configured threshold,
+ * add the input string to the group, otherwise create a new group
+ *
+ * The clustering operation is expensive; LCS on its own is computationally
+ * O(mn) on its two input strings and optimally requires O(min(m,n)) memory. In
+ * general each input string should be compared against all known strings,
+ * giving O(n^2) behaviour of the clustering algorithm on top of the O(mn) LCS
+ * similarity measurement.
+ *
+ * strgrp tries to battle this complexity on several fronts:
+ *
+ * 1. Coarse reduction of the required comparisons
+ * 1a. Each group has a 'key', which is the string that triggered the creation
+ * of the group
+ * 1b. Input strings are only compared against group keys rather than all known
+ * strings, reducing the complexity to the current number of groups rather
+ * than all known strings. Note due the pathological case where the number
+ * of groups is equal to the number of known strings the algorithm still
+ * has O(n^2) computational complexity
+ *
+ * 2. Elimination of LCS computations that will never breach the configured
+ * threshold
+ * 2a. This can be measured from the length of the compared strings
+ * 2b. This avoids invoking the O(mn) behaviour of LCS
+ *
+ * 3. Caching of input strings and their associated group
+ * 3a. By incurring the cost of a map's string hash function we may eliminate
+ * all calls to the LCS function for exact matches, potentially reducing
+ * the insertion to a constant-time operation.
+ *
+ * 4. Whilst the data dependencies of LCS prevent internally parallel
+ * implementations, LCS as a function can be applied in parallel. The code
+ * uses OpenMP to automatically and concurrently distribute scoring of the
+ * input string against group keys across threads.
+ *
+ * [1] https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
+ *
+ * License: LGPL
+ * Author: Andrew Jeffery <andrew@aj.id.au>
+ *
+ * Example:
+ * #define BUF_SIZE 512
+ * FILE *const f = fdopen(0, "r");
+ * char *buf = malloc(BUF_SIZE);
+ * struct strgrp *ctx = strgrp_new(0.85);
+ * while(fgets(buf, BUF_SIZE, f)) {
+ * buf[strcspn(buf, "\r\n")] = '\0';
+ * if (!strgrp_add(ctx, buf, NULL)) {
+ * printf("Failed to classify %s\n", buf);
+ * }
+ * }
+ *
+ * // Re-implement something similar to strgrp_print() via API as an
+ * // example
+ * struct strgrp_iter *iter = strgrp_iter_new(ctx);
+ * const struct strgrp_grp *grp;
+ * while(NULL != (grp = strgrp_iter_next(iter))) {
+ * printf("%s\n", strgrp_grp_key(grp));
+ * struct strgrp_grp_iter *grp_iter = strgrp_grp_iter_new(grp);
+ * const struct strgrp_item *item;
+ * while(NULL != (item = strgrp_grp_iter_next(grp_iter))) {
+ * printf("\t%s\n", strgrp_item_key(item));
+ * }
+ * strgrp_grp_iter_free(grp_iter);
+ * }
+ * strgrp_iter_free(iter);
+ * strgrp_free(ctx);
+ * free(buf);
+ * fclose(f);
+ */
+int main(int argc, char *argv[]) {
+ if (argc != 2) {
+ return 1;
+ }
+
+ if (strcmp(argv[1], "depends") == 0) {
+ printf("ccan/darray\n");
+ printf("ccan/stringmap\n");
+ printf("ccan/tal\n");
+ printf("ccan/tal/str\n");
+ return 0;
+ }
+
+ if (strcmp(argv[1], "testdepends") == 0) {
+ printf("ccan/str\n");
+ return 0;
+ }
+
+ return 1;
+}
diff --git a/ccan/strgrp/strgrp.c b/ccan/strgrp/strgrp.c
new file mode 100644
index 00000000..1f3d2822
--- /dev/null
+++ b/ccan/strgrp/strgrp.c
@@ -0,0 +1,366 @@
+/*
+ Group similar strings
+ Copyright (C) 2014 Andrew Jeffery <andrew@aj.id.au>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "ccan/darray/darray.h"
+#include "ccan/stringmap/stringmap.h"
+#include "ccan/tal/tal.h"
+#include "ccan/tal/str/str.h"
+#include "strgrp.h"
+
+typedef darray(struct strgrp_grp *) darray_grp;
+typedef darray(struct strgrp_item *) darray_item;
+
+typedef stringmap(struct strgrp_grp *) stringmap_grp;
+
+struct grp_score {
+ struct strgrp_grp *grp;
+ double score;
+};
+
+typedef darray(struct grp_score *) darray_score;
+
+struct strgrp {
+ double threshold;
+ stringmap_grp known;
+ unsigned int n_grps;
+ darray_grp grps;
+ struct grp_score *scores;
+};
+
+struct strgrp_iter {
+ const struct strgrp *ctx;
+ int i;
+};
+
+struct strgrp_grp {
+ const char *key;
+ size_t key_len;
+ darray_item items;
+ int32_t n_items;
+};
+
+struct strgrp_grp_iter {
+ const struct strgrp_grp *grp;
+ int i;
+};
+
+struct strgrp_item {
+ const char *key;
+ void *value;
+};
+
+#define ROWS 2
+
+static inline int cmi(int i, int j) {
+ return ROWS * j + i;
+}
+
+static inline int16_t
+lcs(const char *const a, const char *const b) {
+ const int lb = strlen(b);
+ const int lbp1 = lb + 1;
+ int16_t *const lookup = calloc(ROWS * lbp1, sizeof(int16_t));
+ if (!lookup) {
+ return -1;
+ }
+ int ia, ib;
+ for (ia = (strlen(a) - 1); ia >= 0; ia--) {
+ const char iav = a[ia];
+ for (ib = lb - 1; ib >= 0; ib--) {
+ const char ibv = b[ib];
+ const int ial = (ia + 1) & 1; // ia last
+ const int iac = ia & 1; // ia current
+ const int ibl = ib + 1; // ib last
+ // don't need separate "ib current" as it's just ib
+ if (iav == ibv) {
+ lookup[cmi(iac, ib)] = 1 + lookup[cmi(ial, ibl)];
+ } else {
+ const int16_t valb = lookup[cmi(ial, ib)];
+ const int16_t vabl = lookup[cmi(iac, ibl)];
+ lookup[cmi(iac, ib)] = (valb > vabl) ? valb : vabl;
+ }
+ }
+ }
+ int16_t result = lookup[0];
+ free(lookup);
+ return result;
+}
+
+#undef ROWS
+
+static inline double
+nlcs(const char *const a, const char *const b) {
+ const double lcss = lcs(a, b);
+ return 2 * lcss / (strlen(a) + strlen(b));
+}
+
+static bool
+should_grp_score(const struct strgrp *const ctx,
+ const struct strgrp_grp *const grp, const char *const str) {
+ const size_t strl = strlen(str);
+ const size_t keyl = grp->key_len;
+ double sr = strl / keyl;
+ if (1 < sr) {
+ sr = 1 / sr;
+ }
+ return ctx->threshold <= sr;
+}
+
+static inline double
+grp_score(const struct strgrp_grp *const grp, const char *const str) {
+ return nlcs(grp->key, str);
+}
+
+static struct strgrp_item *
+new_item(tal_t *const tctx, const char *const str, void *const data) {
+ struct strgrp_item *i = talz(tctx, struct strgrp_item);
+ if (!i) {
+ return NULL;
+ }
+ i->key = tal_strdup(i, str);
+ i->value = data;
+ return i;
+}
+
+static bool
+add_item(struct strgrp_grp *const ctx, const char *const str,
+ void *const data) {
+ struct strgrp_item *i = new_item(ctx, str, data);
+ if (!i) {
+ return false;
+ }
+ darray_push(ctx->items, i);
+ ctx->n_items++;
+ return true;
+}
+
+static void
+free_grp(struct strgrp_grp *grp) {
+ darray_free(grp->items);
+}
+
+static struct strgrp_grp *
+new_grp(tal_t *const tctx, const char *const str, void *const data) {
+ struct strgrp_grp *b = talz(tctx, struct strgrp_grp);
+ if (!b) {
+ return NULL;
+ }
+ b->key = tal_strdup(b, str);
+ b->key_len = strlen(str);
+ b->n_items = 0;
+ darray_init(b->items);
+ tal_add_destructor(b, free_grp);
+ if (!add_item(b, str, data)) {
+ return tal_free(b);
+ }
+ return b;
+}
+
+static struct strgrp_grp *
+add_grp(struct strgrp *const ctx, const char *const str,
+ void *const data) {
+ struct strgrp_grp *b = new_grp(ctx, str, data);
+ if (!b) {
+ return NULL;
+ }
+ darray_push(ctx->grps, b);
+ ctx->n_grps++;
+ if (ctx->scores) {
+ if (!tal_resize(&ctx->scores, ctx->n_grps)) {
+ return NULL;
+ }
+ } else {
+ ctx->scores = tal_arr(ctx, struct grp_score, ctx->n_grps);
+ if (!ctx->scores) {
+ return NULL;
+ }
+ }
+ return b;
+}
+
+struct strgrp *
+strgrp_new(const double threshold) {
+ struct strgrp *ctx = talz(NULL, struct strgrp);
+ ctx->threshold = threshold;
+ stringmap_init(ctx->known, NULL);
+ darray_init(ctx->grps);
+ return ctx;
+}
+
+static inline void
+cache(struct strgrp *const ctx, struct strgrp_grp *const grp,
+ const char *const str) {
+ *(stringmap_enter(ctx->known, str)) = grp;
+}
+
+static struct strgrp_grp *
+grp_for(struct strgrp *const ctx, const char *const str) {
+ if (!ctx->n_grps) {
+ return NULL;
+ }
+ {
+ struct strgrp_grp **const grp = stringmap_lookup(ctx->known, str);
+ if (grp) {
+ return *grp;
+ }
+ }
+ int i;
+ #pragma omp parallel for schedule(dynamic)
+ for (i = 0; i < ctx->n_grps; i++) {
+ ctx->scores[i].grp = darray_item(ctx->grps, i);
+ const bool ss = should_grp_score(ctx, ctx->scores[i].grp, str);
+ ctx->scores[i].score = ss ? grp_score(ctx->scores[i].grp, str) : 0;
+ }
+ struct grp_score *max = NULL;
+ for (i = 0; i < ctx->n_grps; i++) {
+ if (!max || ctx->scores[i].score > max->score) {
+ max = &(ctx->scores[i]);
+ }
+ }
+ return (max && max->score >= ctx->threshold) ? max->grp : NULL;
+}
+
+const struct strgrp_grp *
+strgrp_grp_for(struct strgrp *const ctx, const char *const str) {
+ return grp_for(ctx, str);
+}
+
+const struct strgrp_grp *
+strgrp_add(struct strgrp *const ctx, const char *const str,
+ void *const data) {
+ bool inserted = false;
+ struct strgrp_grp *pick = grp_for(ctx, str);
+ if (pick) {
+ inserted = add_item(pick, str, data);
+ } else {
+ pick = add_grp(ctx, str, data);
+ inserted = (NULL != pick);
+ }
+ if (inserted) {
+ assert(NULL != pick);
+ cache(ctx, pick, str);
+ }
+ return pick;
+}
+
+struct strgrp_iter *
+strgrp_iter_new(struct strgrp *const ctx) {
+ struct strgrp_iter *iter = talz(ctx, struct strgrp_iter);
+ if (!iter) {
+ return NULL;
+ }
+ iter->ctx = ctx;
+ iter->i = 0;
+ return iter;
+}
+
+const struct strgrp_grp *
+strgrp_iter_next(struct strgrp_iter *const iter) {
+ return (iter->ctx->n_grps == iter->i) ?
+ NULL : darray_item(iter->ctx->grps, iter->i++);
+}
+
+void
+strgrp_iter_free(struct strgrp_iter *const iter) {
+ tal_free(iter);
+}
+
+struct strgrp_grp_iter *
+strgrp_grp_iter_new(const struct strgrp_grp *const grp) {
+ struct strgrp_grp_iter *iter = talz(grp, struct strgrp_grp_iter);
+ if (!iter) {
+ return NULL;
+ }
+ iter->grp = grp;
+ iter->i = 0;
+ return iter;
+}
+
+const struct strgrp_item *
+strgrp_grp_iter_next(struct strgrp_grp_iter *const iter) {
+ return (iter->grp->n_items == iter->i) ?
+ NULL : darray_item(iter->grp->items, iter->i++);
+}
+
+void
+strgrp_grp_iter_free(struct strgrp_grp_iter *iter) {
+ tal_free(iter);
+}
+
+const char *
+strgrp_grp_key(const struct strgrp_grp *const grp) {
+ return grp->key;
+}
+
+const char *
+strgrp_item_key(const struct strgrp_item *const item) {
+ return item->key;
+}
+
+void *
+strgrp_item_value(const struct strgrp_item *const item) {
+ return item->value;
+}
+
+void
+strgrp_free(struct strgrp *const ctx) {
+ darray_free(ctx->grps);
+ stringmap_free(ctx->known);
+ tal_free(ctx);
+}
+
+void
+strgrp_free_cb(struct strgrp *const ctx, void (*cb)(void *data)) {
+ struct strgrp_grp **grp;
+ struct strgrp_item **item;
+ darray_foreach(grp, ctx->grps) {
+ darray_foreach(item, (*grp)->items) {
+ cb((*item)->value);
+ }
+ }
+ strgrp_free(ctx);
+}
+
+#include <stdio.h>
+
+static void
+print_item(const struct strgrp_item *item) {
+ printf("\t%s\n", item->key);
+}
+
+static void
+print_grp(const struct strgrp_grp *const grp) {
+ struct strgrp_item **item;
+ printf("%s:\n", grp->key);
+ darray_foreach(item, grp->items) {
+ print_item(*item);
+ }
+ printf("\n");
+}
+
+void
+strgrp_print(const struct strgrp *const ctx) {
+ struct strgrp_grp **grp;
+ darray_foreach(grp, ctx->grps) {
+ print_grp(*grp);
+ }
+}
diff --git a/ccan/strgrp/strgrp.h b/ccan/strgrp/strgrp.h
new file mode 100644
index 00000000..4f0d57b7
--- /dev/null
+++ b/ccan/strgrp/strgrp.h
@@ -0,0 +1,186 @@
+/* GNU LGPL - see LICENSE file for details */
+
+#ifndef STRGRP_H
+#define STRGRP_H
+#include <stdbool.h>
+
+struct strgrp;
+struct strgrp_iter;
+struct strgrp_grp;
+struct strgrp_grp_iter;
+struct strgrp_item;
+
+/**
+ * Constructs a new strgrp instance.
+ * @threshold: A value in [0.0, 1.0] describing the desired similarity of
+ * strings in a cluster
+ *
+ * @return A heap-allocated strgrp instance, or NULL if initialisation fails.
+ * Ownership of the pointer resides with the caller, which must be freed with
+ * strgrp_free.
+ */
+struct strgrp *
+strgrp_new(double threshold);
+
+/**
+ * Find a group which best matches the provided string key.
+ * @ctx: The strgrp instance to search
+ * @str: The string key to cluster
+ *
+ * The returned group is the group providing the maximum score that is equal to
+ * or above the configured threshold.
+ *
+ * @return A matched group, or NULL if no reasonable group is found. Ownership
+ * of the returned pointer resides with the strgrp instance and it becomes
+ * invalid if the strgrp instance is freed.
+ */
+const struct strgrp_grp *
+strgrp_grp_for(struct strgrp *ctx, const char *str);
+
+/**
+ * Add a string key and arbitrary data value (together, an item) to the
+ * appropriate group.
+ * @ctx: The strgrp instance to add the string and data
+ * @str: The string key used to select a group. The caller retains ownership of
+ * the pointer and may free or change the memory prior to freeing the
+ * strgrp instance.
+ * @data: The data to attach to the group's new entry. The caller retains
+ * ownership of the pointer, but for correctness its lifetime should be at
+ * least equal to the lifetime of the strgrp instance.
+ *
+ * Returns the group to which the item was added. Ownership of the returned
+ * pointer resides with the strgrp instance and it becomes invalid if the
+ * strgrp instance is freed.
+ */
+const struct strgrp_grp *
+strgrp_add(struct strgrp *ctx, const char *str, void *data);
+
+/**
+ * Create an iterator over the current groups.
+ * @ctx: The strgrp instance to iterate over
+ *
+ * @return An iterator structure, or NULL if a failure occurred. Ownership of
+ * the returned pointer is with the strgrp instance. The caller should pass the
+ * pointer strgrp_iter_free when the iterator is exhausted. It is invalid to
+ * call strgrp_iter_next or strgrp_iter_free on the returned pointer after the
+ * strgrp instance has been freed.
+ */
+struct strgrp_iter *
+strgrp_iter_new(struct strgrp *ctx);
+
+/**
+ * Extract the next group from a group iterator
+ * @iter: The iterator in question
+ *
+ * Returns the next group in the iterator or NULL if no further groups exist.
+ * Ownership of the returned pointer resides with the strgrp instance and
+ * becomes invalid if the strgrp instance is freed.
+ */
+const struct strgrp_grp *
+strgrp_iter_next(struct strgrp_iter *iter);
+
+/**
+ * Clean up a group iterator instance
+ * @iter: The iterator to free
+ */
+void
+strgrp_iter_free(struct strgrp_iter *iter);
+
+/**
+ * Extract the key for a group.
+ * @grp: A strgrp_grp pointer
+ *
+ * A group's key is the input string that caused the creation of the group.
+ *
+ * Returns the group key. Ownership of the pointer resides with the grp
+ * parameter and by extension the strgrp instance. The caller must duplicate
+ * the string if the content is required beyond the lifetime of the strgrp
+ * instance.
+ */
+const char *
+strgrp_grp_key(const struct strgrp_grp *grp);
+
+/**
+ * Create an iterator over items in the provided group
+ * @grp: The group whose items to iterate over
+ *
+ * @return An iterator structure, or NULL if a failure occurred. Ownership of
+ * the returned pointer is with the strgrp instance. The caller should pass the
+ * pointer strgrp_grp_iter_free when the iterator is exhausted. It is invalid
+ * to call strgrp_grp_iter_next or strgrp_grp_iter_free on the returned pointer
+ * after the strgrp instance has been freed.
+ */
+struct strgrp_grp_iter *
+strgrp_grp_iter_new(const struct strgrp_grp *grp);
+
+/**
+ * Extract the next item from a item iterator
+ * @iter: The iterator in question
+ *
+ * Returns the next group in the iterator or NULL if no further items exist.
+ * Ownership of the returned pointer resides with the strgrp instance and
+ * becomes invalid if the strgrp instance is freed.
+ */
+const struct strgrp_item *
+strgrp_grp_iter_next(struct strgrp_grp_iter *iter);
+
+/**
+ * Clean up an item iterator instance
+ * @iter: The iterator to free
+ */
+void
+strgrp_grp_iter_free(struct strgrp_grp_iter *iter);
+
+/**
+ * Extract the key for an item
+ *
+ * @item: The item in question
+ *
+ * The key is the string input string which generated the item in the cluster.
+ *
+ * Returns the item key. Ownership of the pointer resides with the item
+ * parameter and by extension the strgrp instance. The caller must duplicate
+ * the string if the content is required beyond the lifetime of the strgrp
+ * instance.
+ */
+const char *
+strgrp_item_key(const struct strgrp_item *item);
+
+/**
+ * Extract the value for an item
+ * @item: The item in question
+ *
+ * The value is the arbitrary pointer associated with the input string
+ *
+ * Returns the item value. The ownership of the pointer does not reside with
+ * the strgrp instance, but for correctness should exceed the lifetime of the
+ * strgrp instance.
+ */
+void *
+strgrp_item_value(const struct strgrp_item *item);
+
+/**
+ * Destroy the strgrp instance
+ *
+ * @ctx: The strgrp instance in question
+ */
+void
+strgrp_free(struct strgrp *ctx);
+
+/**
+ * Destroy the strgrp instance, but not before applying cb() to each item's value element
+ * @ctx: The strgrp instance in question
+ * @cb: The callback to execute against each item's value. This might be used
+ * to free the value data.
+ */
+void
+strgrp_free_cb(struct strgrp *ctx, void (*cb)(void *data));
+
+
+/**
+ * Dump the groupings to stdout.
+ * @ctx: The strgrp instance in question
+ */
+void
+strgrp_print(const struct strgrp *ctx);
+#endif
diff --git a/ccan/strgrp/test/api_all_similar_insert_two.c b/ccan/strgrp/test/api_all_similar_insert_two.c
new file mode 100644
index 00000000..7d68920a
--- /dev/null
+++ b/ccan/strgrp/test/api_all_similar_insert_two.c
@@ -0,0 +1,7 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+ create(ctx, 0);
+ return one_group_from_two(ctx, "a", NULL, "b", NULL);
+}
diff --git a/ccan/strgrp/test/api_almost_different_25_insert_two.c b/ccan/strgrp/test/api_almost_different_25_insert_two.c
new file mode 100644
index 00000000..3da484bf
--- /dev/null
+++ b/ccan/strgrp/test/api_almost_different_25_insert_two.c
@@ -0,0 +1,7 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+ create(ctx, 0.25);
+ return one_group_from_two(ctx, "abcde", NULL, "zyxab", NULL);
+}
diff --git a/ccan/strgrp/test/api_almost_different_50_insert_two.c b/ccan/strgrp/test/api_almost_different_50_insert_two.c
new file mode 100644
index 00000000..790930b1
--- /dev/null
+++ b/ccan/strgrp/test/api_almost_different_50_insert_two.c
@@ -0,0 +1,7 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+ create(ctx, 0.50);
+ return one_group_from_two(ctx, "abcde", NULL, "zyabc", NULL);
+}
diff --git a/ccan/strgrp/test/api_almost_different_75_insert_two.c b/ccan/strgrp/test/api_almost_different_75_insert_two.c
new file mode 100644
index 00000000..dcf01a07
--- /dev/null
+++ b/ccan/strgrp/test/api_almost_different_75_insert_two.c
@@ -0,0 +1,7 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+ create(ctx, 0.50);
+ return one_group_from_two(ctx, "abcde", NULL, "zabcd", NULL);
+}
diff --git a/ccan/strgrp/test/api_almost_similar_25_insert_two.c b/ccan/strgrp/test/api_almost_similar_25_insert_two.c
new file mode 100644
index 00000000..35687f5a
--- /dev/null
+++ b/ccan/strgrp/test/api_almost_similar_25_insert_two.c
@@ -0,0 +1,7 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+ create(ctx, 0.25);
+ return two_groups_from_two(ctx, "abcde", NULL, "zyxwa", NULL);
+}
diff --git a/ccan/strgrp/test/api_almost_similar_50_insert_two.c b/ccan/strgrp/test/api_almost_similar_50_insert_two.c
new file mode 100644
index 00000000..62198e7b
--- /dev/null
+++ b/ccan/strgrp/test/api_almost_similar_50_insert_two.c
@@ -0,0 +1,7 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+ create(ctx, 0.50);
+ return two_groups_from_two(ctx, "abcde", NULL, "zyxab", NULL);
+}
diff --git a/ccan/strgrp/test/api_almost_similar_75_insert_two.c b/ccan/strgrp/test/api_almost_similar_75_insert_two.c
new file mode 100644
index 00000000..6c8553f1
--- /dev/null
+++ b/ccan/strgrp/test/api_almost_similar_75_insert_two.c
@@ -0,0 +1,7 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+ create(ctx, 0.75);
+ return two_groups_from_two(ctx, "abcde", NULL, "zyabc", NULL);
+}
diff --git a/ccan/strgrp/test/api_exact_match_insert_two.c b/ccan/strgrp/test/api_exact_match_insert_two.c
new file mode 100644
index 00000000..c44409a4
--- /dev/null
+++ b/ccan/strgrp/test/api_exact_match_insert_two.c
@@ -0,0 +1,7 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+ create(ctx, 1.0);
+ return one_group_from_two(ctx, "a", NULL, "a", NULL);
+}
diff --git a/ccan/strgrp/test/api_illegal_none_similar_insert_two.c b/ccan/strgrp/test/api_illegal_none_similar_insert_two.c
new file mode 100644
index 00000000..bc330c5f
--- /dev/null
+++ b/ccan/strgrp/test/api_illegal_none_similar_insert_two.c
@@ -0,0 +1,7 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+ create(ctx, 1.1);
+ return one_group_from_two(ctx, "a", NULL, "a", NULL);
+}
diff --git a/ccan/strgrp/test/api_insert_duplicate.c b/ccan/strgrp/test/api_insert_duplicate.c
new file mode 100644
index 00000000..7bf6fb4f
--- /dev/null
+++ b/ccan/strgrp/test/api_insert_duplicate.c
@@ -0,0 +1,7 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+ create(ctx, DEFAULT_SIMILARITY);
+ return one_group_from_two(ctx, "a", (void *)"1", "a", (void *)"2");
+}
diff --git a/ccan/strgrp/test/api_insert_one.c b/ccan/strgrp/test/api_insert_one.c
new file mode 100644
index 00000000..68032626
--- /dev/null
+++ b/ccan/strgrp/test/api_insert_one.c
@@ -0,0 +1,23 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ const char k[] = "a";
+ struct strgrp *ctx;
+ const struct strgrp_grp *grp;
+ struct strgrp_grp_iter *iter;
+ const struct strgrp_item *item;
+
+ plan_tests(5);
+ create(ctx, DEFAULT_SIMILARITY);
+ grp = strgrp_add(ctx, k, NULL);
+ ok1(streq(k, strgrp_grp_key(grp)));
+ iter = strgrp_grp_iter_new(grp);
+ item = strgrp_grp_iter_next(iter);
+ ok1(item);
+ ok1(streq(k, strgrp_item_key(item)));
+ ok1(!strgrp_item_value(item));
+ ok1(!strgrp_grp_iter_next(iter));
+ strgrp_grp_iter_free(iter);
+ strgrp_free(ctx);
+ return exit_status();
+}
diff --git a/ccan/strgrp/test/api_insert_two_distinct.c b/ccan/strgrp/test/api_insert_two_distinct.c
new file mode 100644
index 00000000..3ee8e3e5
--- /dev/null
+++ b/ccan/strgrp/test/api_insert_two_distinct.c
@@ -0,0 +1,7 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+ create(ctx, DEFAULT_SIMILARITY);
+ return two_groups_from_two(ctx, "a", NULL, "b", NULL);
+}
diff --git a/ccan/strgrp/test/api_iterate_no_groups.c b/ccan/strgrp/test/api_iterate_no_groups.c
new file mode 100644
index 00000000..a7f692b4
--- /dev/null
+++ b/ccan/strgrp/test/api_iterate_no_groups.c
@@ -0,0 +1,14 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+ struct strgrp_iter *iter;
+
+ plan_tests(1);
+ create(ctx, DEFAULT_SIMILARITY);
+ iter = strgrp_iter_new(ctx);
+ ok1(!strgrp_iter_next(iter));
+ strgrp_iter_free(iter);
+ strgrp_free(ctx);
+ return exit_status();
+}
diff --git a/ccan/strgrp/test/api_iterate_one_group.c b/ccan/strgrp/test/api_iterate_one_group.c
new file mode 100644
index 00000000..b363c2c1
--- /dev/null
+++ b/ccan/strgrp/test/api_iterate_one_group.c
@@ -0,0 +1,16 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+ struct strgrp_iter *iter;
+
+ plan_tests(2);
+ create(ctx, DEFAULT_SIMILARITY);
+ strgrp_add(ctx, "a", NULL);
+ iter = strgrp_iter_new(ctx);
+ ok1(strgrp_iter_next(iter));
+ ok1(!strgrp_iter_next(iter));
+ strgrp_iter_free(iter);
+ strgrp_free(ctx);
+ return exit_status();
+}
diff --git a/ccan/strgrp/test/api_iterate_two_groups.c b/ccan/strgrp/test/api_iterate_two_groups.c
new file mode 100644
index 00000000..ebf81a53
--- /dev/null
+++ b/ccan/strgrp/test/api_iterate_two_groups.c
@@ -0,0 +1,18 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+ struct strgrp_iter *iter;
+
+ plan_tests(3);
+ create(ctx, DEFAULT_SIMILARITY);
+ strgrp_add(ctx, "a", NULL);
+ strgrp_add(ctx, "b", NULL);
+ iter = strgrp_iter_new(ctx);
+ ok1(strgrp_iter_next(iter));
+ ok1(strgrp_iter_next(iter));
+ ok1(!strgrp_iter_next(iter));
+ strgrp_iter_free(iter);
+ strgrp_free(ctx);
+ return exit_status();
+}
diff --git a/ccan/strgrp/test/api_new_free.c b/ccan/strgrp/test/api_new_free.c
new file mode 100644
index 00000000..da3575dd
--- /dev/null
+++ b/ccan/strgrp/test/api_new_free.c
@@ -0,0 +1,11 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+
+ plan_tests(1);
+ create(ctx, DEFAULT_SIMILARITY);
+ strgrp_free(ctx);
+ pass("Successfully initialised strgrp instance");
+ return exit_status();
+}
diff --git a/ccan/strgrp/test/api_test_free_cb.c b/ccan/strgrp/test/api_test_free_cb.c
new file mode 100644
index 00000000..bf85b36e
--- /dev/null
+++ b/ccan/strgrp/test/api_test_free_cb.c
@@ -0,0 +1,16 @@
+#include "../test/helpers.h"
+
+static void
+data_cb(void *data) {
+ ok1(streq("1", (char *)data));
+}
+
+int main(void) {
+ struct strgrp *ctx;
+
+ plan_tests(1);
+ create(ctx, DEFAULT_SIMILARITY);
+ strgrp_add(ctx, "a", (void *)"1");
+ strgrp_free_cb(ctx, &data_cb);
+ return exit_status();
+}
diff --git a/ccan/strgrp/test/api_test_print.c b/ccan/strgrp/test/api_test_print.c
new file mode 100644
index 00000000..8635882d
--- /dev/null
+++ b/ccan/strgrp/test/api_test_print.c
@@ -0,0 +1,15 @@
+#include "../test/helpers.h"
+
+int main(void) {
+ struct strgrp *ctx;
+
+ plan_tests(1);
+ create(ctx, DEFAULT_SIMILARITY);
+ strgrp_add(ctx, "a", (void *)"1");
+ strgrp_add(ctx, "a", (void *)"2");
+ strgrp_add(ctx, "b", (void *)"3");
+ strgrp_print(ctx);
+ strgrp_free(ctx);
+ pass("No errors");
+ return exit_status();
+}
diff --git a/ccan/strgrp/test/helpers.c b/ccan/strgrp/test/helpers.c
new file mode 100644
index 00000000..24b5dfd0
--- /dev/null
+++ b/ccan/strgrp/test/helpers.c
@@ -0,0 +1,67 @@
+//#include "../test/helpers.h"
+#include "helpers.h"
+
+int
+one_group_from_two(struct strgrp *ctx,
+ const char *const k1, void *v1,
+ const char *const k2, void *v2) {
+ const struct strgrp_grp *grp1, *grp2;
+ struct strgrp_grp_iter *iter;
+ const struct strgrp_item *item;
+
+ plan_tests(10);
+ grp1 = strgrp_add(ctx, k1, v1);
+ ok1(grp1);
+ grp2 = strgrp_add(ctx, k2, v2);
+ ok1(grp2);
+ ok1(grp1 == grp2);
+ iter = strgrp_grp_iter_new(grp1);
+ item = strgrp_grp_iter_next(iter);
+ ok1(item);
+ ok1(streq(k1, strgrp_item_key(item)));
+ ok1(v1 == strgrp_item_value(item));
+ item = strgrp_grp_iter_next(iter);
+ ok1(item);
+ ok1(streq(k2, strgrp_item_key(item)));
+ ok1(v2 == strgrp_item_value(item));
+ ok1(!strgrp_grp_iter_next(iter));
+ strgrp_grp_iter_free(iter);
+ strgrp_free(ctx);
+ return exit_status();
+}
+
+int
+two_groups_from_two(struct strgrp *ctx,
+ const char *const k1, void *v1,
+ const char *const k2, void *v2) {
+ const struct strgrp_grp *grp1, *grp2;
+ struct strgrp_grp_iter *iter;
+ const struct strgrp_item *item;
+
+ plan_tests(11);
+ grp1 = strgrp_add(ctx, k1, v1);
+ ok1(grp1);
+ grp2 = strgrp_add(ctx, k2, v2);
+ ok1(grp2);
+ ok1(grp1 != grp2);
+ {
+ iter = strgrp_grp_iter_new(grp1);
+ item = strgrp_grp_iter_next(iter);
+ ok1(item);
+ ok1(streq(k1, strgrp_item_key(item)));
+ ok1(v1 == strgrp_item_value(item));
+ ok1(!strgrp_grp_iter_next(iter));
+ strgrp_grp_iter_free(iter);
+ }
+ {
+ iter = strgrp_grp_iter_new(grp2);
+ item = strgrp_grp_iter_next(iter);
+ ok1(item);
+ ok1(streq(k2, strgrp_item_key(item)));
+ ok1(v2 == strgrp_item_value(item));
+ ok1(!strgrp_grp_iter_next(iter));
+ strgrp_grp_iter_free(iter);
+ }
+ strgrp_free(ctx);
+ return exit_status();
+}
diff --git a/ccan/strgrp/test/helpers.h b/ccan/strgrp/test/helpers.h
new file mode 100644
index 00000000..8a754b0c
--- /dev/null
+++ b/ccan/strgrp/test/helpers.h
@@ -0,0 +1,29 @@
+#ifndef STRGRP_TEST_HELPERS
+#define STRGRP_TEST_HELPERS
+#include <stdbool.h>
+#include "ccan/str/str.h"
+#include "ccan/tap/tap.h"
+#include "../strgrp.h"
+
+#define DEFAULT_SIMILARITY 0.85
+
+#define create(dst, similarity) \
+ do { \
+ dst = strgrp_new(similarity); \
+ if (!dst) { \
+ fail("strgrp_new() returned NULL reference"); \
+ return 1; \
+ } \
+ } while (0)
+
+int
+one_group_from_two(struct strgrp *ctx,
+ const char *const k1, void *v1,
+ const char *const k2, void *v2);
+
+int
+two_groups_from_two(struct strgrp *ctx,
+ const char *const k1, void *v1,
+ const char *const k2, void *v2);
+
+#endif /* STRGRP_TEST_HELPERS */