Published by
Feb 16, 2012 (last update: Feb 16, 2012)

Input correction

Score: 3.5/5 (49 votes)
*****
Note: I'm not 100% confident that my algorithm for comparing strings (see the function "similarity()" below) is correct; if anyone has any improvements to make, feel free to PM me.

I've written a function which can suggest corrections to invalid user input.

As an example, when using git(1), if you type "git comit" it says, "Did you mean this? / commit". It detects that the parameter ("comit") is invalid and then suggests a correction ("commit"). That's what this function does (or tries to do).

It does this by comparing two strings and generating a 'score' for how similar they are (by summing the absolute values of the return values of strcmp(3) when applied to the two strings; zero is the most similar, there isn't technically a limit on how dissimilar two strings can be). It uses these scores to build an associative array consisting of the score and the corresponding string. The array is then sorted (using qsort(3)) so that the keys are in the correct order.

Source code:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/* Copyright (C) 2012 Chrisname.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#define MIN(a, b)	(((a) < (b)) ?  (a) : (b))
#define MAX(a, b)	(((a) > (b)) ?  (a) : (b))
#define ABS(x)		(((x) < 0)   ? -(x) : (x))

/**
 * A simple associative array.
 */
struct assoc_array {
	int		key;	/**< The key value. */
	const char*	value;	/**< The string value. */
};

/* Comparator for qsort(3) */
static int qsort_compare_keys(const void* p1, const void* p2)
{
	struct assoc_array* node1 = (struct assoc_array*)p1;
	struct assoc_array* node2 = (struct assoc_array*)p2;
	/* Return the difference between 'node1->key' and 'node2->key'
	 */
	return node1->key - node2->key;
}

/** Returns the absolute similarity score for 's1' and 's2' */
static int similarity(const char* s1, const char* s2)
{
	int score = 0;
	unsigned long len1 = strlen(s1), len2 = strlen(s2), len = MAX(len1, len2);
	int i = 0;
	/* Sum the absolute values of the differences between each character of
	 * s1 and s2 given by strcmp(3)
	 */
	while (len --> 0) {
		int abs;
		/* If one string has ended, compare the rest of the other string
		 * with zero. Otherwise, compare the two strings as normal.
		 */
		if (len1 >= i && len2 < i) {
			int x = 0 - s1[i];
			score += ABS(x);
		} else if (len2 >= i && len1 < i) {
			int x = 0 - s2[i];
			score += ABS(x);
		} else {
			abs = ABS(strcmp(s1 + i, s2 + i));
			score += abs;
			/* Skip past the non-matching char so we don't compare it again
			 */
			while (s1[i] == s2[i] && len > 0) {
				++i;
				--len;
			}
			++i;
		}
	}
	return score;
}

/**
 * \brief	Suggests corrections to user input.
 * \param key	The key string.
 * \param values An array of strings to compare with the key string.
 * \param count	The number of elements in array 'values'.
 * \param n	The number of outputs to print.
 * \param reverse Whether to print output in reverse.
 * \param detail Whether to print detailed information.
 */
void correct(const char* key, char* values[], int count, long n, int reverse, int detail)
{
	int i;
	struct assoc_array* aa = malloc(sizeof(struct assoc_array) * count);
	/* Build an associative array out of values
	 */
	for (i = 0; i < count; ++i) {
		aa[i].key   = similarity(key, values[i]);
		aa[i].value = values[i];
	}
	/* Sort 'aa' by key
	 */
	qsort(aa, count, sizeof(struct assoc_array), qsort_compare_keys);
	/* Print the values in key order
	 */
	if (n == 0)
		n = count;
	if (reverse) {
		if (n)
			n = MAX(n, count - n);
		while (n --> 0) {
			if (detail)
				printf("aa[%d] = { %d, %s }\n", n, aa[n].key,
								aa[n].value);
			else
				printf("%s\n", aa[n].value);
		}
	} else {
		if (n)
			n = MIN(n, count);
		for (i = 0; i < n; ++i) {
			if (detail)
				printf("aa[%d] = { %d, %s }\n", i, aa[i].key,
								aa[i].value);
			else
				printf("%s\n", aa[i].value);
		}
	}
	free(aa);
}


Example usage:
Here is the source code of the program I created the function for, which I wrote for use in shell scripts. The code above has been omitted; simply paste it in place of the "PASTE HERE" marker in the code below.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
/*
 * correct - finds which parameter/s is/are most similar to the key parameter.
 *
 * Copyright (C) 2012 Chrisname.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PROGRAM_NAME	"correct"
#define VERSION_MAJOR	1
#define VERSION_MINOR	0
#define VERSION_PATCH	0

/*
 * PASTE HERE.
 */

/** Shows the usage message */
static void show_usage(const char* argv0)
{
	printf("correct - finds which parameter/s is/are most similar to the key parameter.\n"
		"Usage: %s [options] <key> <value 1> [value 2 [value ... [value n ]]]\n"
		"Options:\n"
		"\t-h,--help\tShow this help message\n"
		"\t-v,--version\tShow version information\n"
		"\t-n,--number\tSet the number of parameters to print. 0 = all. Default: 1\n"
		"\t-r,--reverse\tPrint output in reverse\n"
		"\t-d,--detail\tPrint more detailed information\n", argv0);
}

/** Shows the version message */
static void show_version()
{
	printf(PROGRAM_NAME " version %d.%d%d\n", VERSION_MAJOR, VERSION_MINOR,
								VERSION_PATCH);
}

int main(int argc, char* argv[])
{
	if (argc < 2) {
		show_usage(argv[0]);
		exit(EXIT_FAILURE);
	}
	int reverse = 0, detail = 0;
	long n = 1;
	for (;;) {
		static const char* optstring = "hvn:rd";
		static struct option longopts[] = {
			{ "help",	0, NULL, 'h' },
			{ "version",	0, NULL, 'v' },
			{ "number",	1, NULL, 'n' },
			{ "reverse",	0, NULL, 'r' },
			{ "detail",	1, NULL, 'd' }
		};
		int c = getopt_long(argc, argv, optstring, longopts, NULL);
		if (c < 0)
			break;
		switch (c) {
			case 'h':
				show_usage(argv[0]);
				break;
			case 'v':
				show_version();
				break;
			case 'n':
				n = strtol(optarg, NULL, 10);
				break;
			case 'r':
				reverse = 1;
				break;
			case 'd':
				detail = 1;
				break;
			default:
				break;
		}
	}
	if ((argc - optind) < 2) {
		show_usage(argv[0]);
		exit(EXIT_FAILURE);
	}
	correct(argv[optind], &argv[optind + 1], argc - optind - 1, n, reverse, detail);
	return 0;
}


Output:
$ ./correct -n 0 hello hallo hail help hello
hello                    <-- This string is equal to the key string
hallo                    <-- This string has a similarity of 4 to the key string
help                     <-- This string has a similarity of 115 to the key string
hail                     <-- This string has a similarity of 118 to the key string