I am looking at a better hashing algorithms for exim -- especialy for file
system access. Attached is a patch for exim-3.01 (expand.c) that does the
following:
${phash_n_m:<string>} or ${ph_n_m:<string>}
where
n is the number of characters that are used to compile the hash,
m is the number of characters that are used as a hash.
<string> is any string.
It returns only one character from the set:
static char *hashcodes = "abcdefghijklmnopqrtsuvwxyz"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"0123456789";
Below are some results of the hash on a set ("aa", "ab", etc...) of
double digits. "-nbr" is "n", and "-hash" is "m"... It does split things
fairly well, and can scale nicely -- both in "n" and "m".
; hash -nbr 2 -hash 2
a has 666 entries
b has 666 entries
total is 1332
; hash -nbr 2 -hash 8
a has 165 entries
b has 168 entries
c has 170 entries
d has 168 entries
e has 165 entries
f has 165 entries
g has 166 entries
h has 165 entries
total is 1332
What do people think? Is there anything you'd like to suggest as to making
this better/more scalable/returning more than one value/whatever?
Of course, criticism are welcome -- if they are of the constructive type ;)
BTW If you were thinking of doing something along those lines with hashing,
I would be happy to help/expand the code to match your spec.
--
Yann Golanski Internet Systems Developer
yann.golanski@??? The Planet Online
*** expand.orig Fri May 21 14:21:08 1999
--- expand.c Wed May 26 14:23:59 1999
***************
*** 961,964 ****
--- 961,967 ----
hash_<n>_<m> hash the string, making one that is of length n,
using m (default 26) characters from hashcodes
+ phash_<n>_<m> hash the string, using n characters and
+ mapping them onto m (default 26) characters
+ from hashcodes
lc lowercase the string
uc uppercase the string
***************
*** 1777,1785 ****
left. hash_n or h_n makes a hash of length n from the string, yielding
n characters from the set a-z; hash_n_m makes a hash of length n, but
! uses m characters from the set a-zA-Z0-9. */
!
! if (strncmp(name, "length_", 7) == 0 || strncmp(name, "l_", 2) == 0 ||
! strncmp(name, "substr_", 7) == 0 || strncmp(name, "s_", 2) == 0 ||
! strncmp(name, "hash_", 5) == 0 || strncmp(name, "h_", 2) == 0)
{
int sign = 1;
--- 1780,1792 ----
left. hash_n or h_n makes a hash of length n from the string, yielding
n characters from the set a-z; hash_n_m makes a hash of length n, but
! uses m characters from the set a-zA-Z0-9. phash_n_m (or ph_n_m) takes n
! characters and hash them to a sub set m, and returns only ONE
! character. */
!
! if (strncmp(name, "length_", 7) == 0 || strncmp(name, "l_", 2) == 0 ||
! strncmp(name, "substr_", 7) == 0 || strncmp(name, "s_", 2) == 0 ||
! strncmp(name, "hash_", 5) == 0 || strncmp(name, "h_", 2) == 0 ||
! strncmp(name, "phash_", 6) == 0 || strncmp(name, "ph_", 3) == 0
! )
{
int sign = 1;
***************
*** 1799,1804 ****
if (name[0] == 's' && *num == '-') { sign = -1; num++; }
}
! else { pn = &len; len = 0; }
!
while (*num != 0)
{
--- 1806,1816 ----
if (name[0] == 's' && *num == '-') { sign = -1; num++; }
}
! else
! {
! pn = &len;
! len = 0;
! }
!
!
while (*num != 0)
{
***************
*** 1815,1819 ****
goto EXPAND_FAILED;
}
! else *pn = (*pn)*10 + *num++ - '0';
}
offset *= sign;
--- 1827,1832 ----
goto EXPAND_FAILED;
}
! else
! *pn = (*pn)*10 + *num++ - '0';
}
offset *= sign;
***************
*** 1822,1826 ****
second is the number of characters to use. */
! if (name[0] == 'h')
{
hashcount = (len < 0)? 26 : len;
--- 1835,1839 ----
second is the number of characters to use. */
! if (name[0] == 'h' || name[0] == 'p')
{
hashcount = (len < 0)? 26 : len;
***************
*** 1835,1838 ****
--- 1848,1852 ----
}
+
/* Otherwise, a negative offset positions from the rhs (happens only
for substr) */
***************
*** 1869,1872 ****
--- 1883,1887 ----
offset is always zero. */
+
if (sublen < offset + len) len = sublen - offset;
if (len > 0)
***************
*** 1885,1890 ****
for (i = 0; i < len; i++)
sub[i] = hashcodes[(uschar)(sub[i]) % hashcount];
}
! yield = string_cat(yield, &size, &ptr, sub + offset, len);
}
continue;
--- 1900,1920 ----
for (i = 0; i < len; i++)
sub[i] = hashcodes[(uschar)(sub[i]) % hashcount];
+ yield = string_cat(yield, &size, &ptr, sub + offset, len);
}
!
! /* Planet internal hashing function:
! * Takes in phash_n_m where n is the number of characters to hash
! * and m is the set of characters returned by the hash.
! */
! if ( name[0] == 'p' )
! {
! int i;
! int total = 0;
!
! for (i = 0; i < len; i++ )
! total += (uschar)(sub[i]);
! sub[0] = hashcodes[total % hashcount];
! yield = string_cat(yield, &size, &ptr, &sub[0], 1);
! }
}
continue;