8 Eylül 2007 Cumartesi

Reference and Explanation

There is an excellent article on "Fast String Searching" by John W. Ross in the "C/C++ Users Journal" of July 1995. A paper copy of the article will be put on library reserve for photocopying.

The code provided in the article has been typed in and is available through the links below. You can easily see how the routines work and make use of these programs.

Links to source code files

Main program for timing and testing

/*
 * Timing/test driver for a string-search routine.
 *
 * Reads the file named by argv[2] NCOPY times into one large buffer,
 * then times a single call to xxxFind() (linked in from one of the
 * search-routine files below) searching for argv[1], and reports the
 * match count and elapsed CPU time.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define MB 400000           /* bytes read per copy of the input file */
#define MP 256              /* pattern buffer size (including NUL) */
#define NCOPY 3             /* number of copies of the file to load */
char *txtbuf;               /* text buffer, allocated in main() */
int xxxFind(int n, char *txt, int m, char *pat);
int main(int argc, char **argv)
{
    char patbuf[MP];
    FILE *in;
    int i, m, n;
    clock_t t0, t1;         /* clock() returns clock_t, not int */
    int nmatch = 0;
    if (argc < 3) {
        fprintf(stderr, "usage: %s pattern file\n", argv[0]);
        exit(EXIT_FAILURE); /* was exit(0): errors should be nonzero */
    }
    if ((in = fopen(argv[2], "r")) == NULL) {
        fprintf(stderr, "cannot open input file: %s\n", argv[2]);
        exit(EXIT_FAILURE);
    }
    /* Bounded copy: the original strcpy() could overflow patbuf. */
    snprintf(patbuf, sizeof patbuf, "%s", argv[1]);
    m = (int)strlen(patbuf);
    if ((txtbuf = malloc(NCOPY * MB)) == NULL) {
        fprintf(stderr, "out of memory\n");
        exit(EXIT_FAILURE);
    }
    n = 0;
    for (i = 0; i < NCOPY; i++) {
        n += (int)fread(txtbuf + n, sizeof(char), MB, in);
        rewind(in);
    }
    fclose(in);             /* was close(in): close() takes an fd, not a FILE* */
    t0 = clock();
    nmatch = xxxFind(n, txtbuf, m, patbuf);
    t1 = clock() - t0;
    printf("%d matches took %.3f\n", nmatch, (float)t1 / CLOCKS_PER_SEC);
    free(txtbuf);
    return 0;
}
/* End of File */

Brute force string searching

#include <stdio.h>
/*
 * bfFind - brute-force string search.
 *
 * Counts the non-overlapping occurrences of pat[0..m-1] in
 * txt[0..n-1], printing the offset of each match to stdout, and
 * returns the count. After a match the scan resumes m characters
 * later, so overlapping occurrences are not counted.
 */
int bfFind(int n, char *txt, int m, char *pat)
{
    int i, j, k, lim;
    int nmatch = 0;
    lim = n - m + 1;        /* one past the last position a match can start */
    i = 0;
    while (i < lim) {
        k = i;
        for (j = 0; j < m && txt[k] == pat[j]; j++)
            k++;
        if (j == m) {       /* whole pattern matched at offset i */
            nmatch++;
            printf("%d \n", i);
            i += m;         /* skip past the match (non-overlapping) */
        }
        else
            i++;
    }
    return nmatch;
}
/* End of File */

Improve by first character search

#include <stdio.h>
#include <string.h>
/*
 * fcFind - brute-force search accelerated by strchr() on the first
 * pattern character.
 *
 * Counts the non-overlapping occurrences of pat[0..m-1] in txt,
 * printing the offset of each match, and returns the count.
 *
 * NOTE(review): strchr() stops at a NUL byte, so txt must be a
 * NUL-terminated string; the n parameter is unused here and is kept
 * only so every xxxFind() routine shares one signature.
 */
int fcFind(int n, char *txt, int m, char *pat)
{
    int i, j, k;
    int nmatch = 0;
    char *cp;
    cp = strchr(txt, pat[0]);
    while (cp) {
        i = (int)(cp - txt);    /* candidate match start */
        k = i;
        for (j = 0; j < m && txt[k] == pat[j]; j++)
            k++;
        if (j == m) {
            nmatch++;
            printf("%d \n", i);
            cp = strchr(txt + i + m, pat[0]);   /* skip past the match */
        }
        else
            cp = strchr(txt + i + 1, pat[0]);
    }
    return nmatch;
}
/* End of File */

Call C library string match function

#include <stdio.h>
#include <string.h>
/*
 * ssFind - search using the C library's strstr().
 *
 * Prints the offset of every occurrence of pat in the NUL-terminated
 * string txt and returns the count. Unlike the other routines here,
 * the scan advances only one character past each match start, so
 * OVERLAPPING occurrences are counted.
 *
 * NOTE(review): pat must be non-empty; an empty pattern makes
 * strstr() match at every position and this loop never terminates.
 */
int ssFind(int n, char *txt, int m, char *pat)
{
    int nmatch = 0;
    char *cp;
    cp = txt;
    while ((cp = strstr(cp, pat)) != NULL) {    /* parenthesize assignment */
        printf("%d\n", (int)(cp - txt));        /* cast: %d expects int */
        nmatch++;
        cp++;       /* step one char: allow overlapping matches */
    }
    return nmatch;
}
/* End of File */

Improve by low frequency character search

#include <stdio.h>
#include <string.h>
/*
 * fclFind - like fcFind, but anchors the scan on the pattern
 * character at index pos (chosen by the caller as a low-frequency
 * character) instead of the first character.
 *
 * Counts the non-overlapping occurrences of pat[0..m-1] in the
 * NUL-terminated string txt, printing each match offset (i - pos),
 * and returns the count.
 */
int fclFind(int n, char *txt, int m, char *pat, int pos)
{
    int i, j, k;
    int nmatch = 0;
    char *cp;
    cp = strchr(txt, pat[pos]);
    while (cp) {
        i = (int)(cp - txt);    /* position of the anchor character */
        k = i - pos;            /* would-be start of the full match */
        if (k < 0) {
            /* BUG FIX: anchor too close to the start of txt; the
               original dereferenced txt[k] with k negative. */
            cp = strchr(txt + i + 1, pat[pos]);
            continue;
        }
        for (j = 0; j < m && txt[k] == pat[j]; j++)
            k++;
        if (j == m) {
            nmatch++;
            printf("%d \n", i - pos);
            cp = strchr(txt + i + m, pat[pos]); /* skip past the match */
        }
        else
            cp = strchr(txt + i + 1, pat[pos]);
    }
    return nmatch;
}

Boyer-Moore-Horspool function

#include <stdio.h>
/*
 * Boyer-Moore-Horspool search.
 *
 * Layout convention: the caller stores the pattern in pat[1..m] and
 * the text in txt[1..n]; bmhInit() overwrites index 0 of each with a
 * sentinel ("guard") byte so the right-to-left comparison loop in
 * bmhFind() needs no explicit j > 0 test.
 *
 * NOTE(review): d[] is indexed directly by char values, so both text
 * and pattern are assumed to contain only characters 0..127 (plain
 * 7-bit ASCII) -- confirm for non-ASCII input.
 */
#define ALPHABETSZ 128
#define PATGRD 0xfe     /* guard written to pat[0] */
#define TXTGRD 0xff     /* guard written to txt[0] */
static int d[ALPHABETSZ];   /* bad-character shift table */

/* Build the shift table and install the guard bytes.
   Destructive: clobbers pat[0] and txt[0]. */
void bmhInit(char *txt, int m, char *pat)
{
    int i;
    /* initialize */
    pat[0] = PATGRD;
    txt[0] = TXTGRD;
    for (i = 0; i < ALPHABETSZ; i++)
        d[i] = m;           /* absent characters allow a full shift */
    /* BUG FIX: the transcription read "d[pat[i]] = m - 1", which
       over-shifts and misses matches (e.g. "abc" in "xabc").  The
       Horspool shift for the character at 1-based position i is
       m - i; later (more rightward) occurrences overwrite earlier
       ones, and the last pattern character keeps the default m. */
    for (i = 1; i < m; i++)
        d[pat[i]] = m - i;
}

/* Count occurrences of pat[1..m] in txt[1..n] (bmhInit() must have
   been called first).  Prints k = start-1 for each match and returns
   the match count. */
int bmhFind(int n, char *txt, int m, char *pat)
{
    int i, j, k;
    int nmatch = 0;
    i = m;                  /* i = 1-based index of the window's last char */
    while (i <= n) {
        k = i;
        /* Compare right-to-left; pat[0] == PATGRD stops the loop
           after all m characters have matched. */
        for (j = m; txt[k] == pat[j]; j--)
            k--;
        if (j == 0) {       /* full match: occupies txt[k+1 .. k+m] */
            nmatch++;
            printf("%d \n", k);
        }
        i += d[txt[i]];     /* bad-character shift on the window's last char */
    }
    return nmatch;
}
/* End of File */

Main program for Boyer-Moore-Horspool function

/*
 * Timing/test driver for the Boyer-Moore-Horspool routines.
 *
 * Same structure as the generic driver, except that the pattern and
 * the text are stored starting at index 1 of their buffers because
 * bmhInit() overwrites index 0 of each with a guard byte.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define MB 400000
#define MP 256
#define NCOPY 3
char *txtbuf;
/* BUG FIX: bmhInit is defined with return type void; declaring it
   int here (as the original did) is undefined behavior. */
void bmhInit(char *txt, int m, char *pat);
int bmhFind(int n, char *txt, int m, char *pat);
int main(int argc, char **argv)
{
    char patbuf[MP];
    FILE *in;
    int i, m, n;
    clock_t t0, t1;         /* clock() returns clock_t, not int */
    int nmatch = 0;
    char *txtp, *patp = patbuf + 1;     /* data lives at index 1.. */
    if (argc < 3) {
        fprintf(stderr, "usage: %s pattern file\n", argv[0]);
        exit(EXIT_FAILURE); /* was exit(0): errors should be nonzero */
    }
    if ((in = fopen(argv[2], "r")) == NULL) {
        fprintf(stderr, "cannot open input file: %s\n", argv[2]);
        exit(EXIT_FAILURE);
    }
    /* Bounded copy (one byte of patbuf is used by the guard). */
    snprintf(patp, sizeof patbuf - 1, "%s", argv[1]);
    m = (int)strlen(patp);
    /* BUG FIX: +1 for the guard byte at txtbuf[0]; the original
       allocated NCOPY*MB but wrote txtbuf[1 .. NCOPY*MB]. */
    if ((txtbuf = malloc(NCOPY * MB + 1)) == NULL) {
        fprintf(stderr, "out of memory\n");
        exit(EXIT_FAILURE);
    }
    txtp = txtbuf + 1;
    n = 0;
    for (i = 0; i < NCOPY; i++) {
        n += (int)fread(txtp + n, sizeof(char), MB, in);
        rewind(in);
    }
    fclose(in);             /* was close(in): close() takes an fd, not a FILE* */
    bmhInit(txtbuf, m, patbuf);
    t0 = clock();
    nmatch = bmhFind(n, txtbuf, m, patbuf);
    t1 = clock() - t0;
    printf("%d matches took %.3f\n", nmatch, (float)t1 / CLOCKS_PER_SEC);
    free(txtbuf);
    return 0;
}
/* End of File */

5 Types of Popularity for Ranking

  • Link popularity: simply links coming into any website from any and all sources. This is a broad term as other popularity types will fall under the effect of Link Popularity because the link is the method.
  • Industry Popularity: the relationship of a site in its known industry to the industry. Are the big industry sites pointing to the site? There are always a few blogs in every industry that stand out, and become celebrity blogs. Matt Cutts, Graywolf, Shoemoney, these are celebrity blogs they gain high industry traffic and link out to their industry. The “BlogRoll” becomes the popularity factor and passes far more power than any directory. Industry popularity focuses on a sites’ prominence in the industry, it can be effective, but it tends to make a group popular and misses the larger core of the industry, so this popularity needs to be factored appropriately. Problems with Industry popularity include blogroll spam, now very common and a potential cause for future devaluation of blogrolls.
  • Social Popularity: very similar to Industry popularity, except here it is values passed from social sites like digg and del.icio.us. Multiple instances of a site on a social site are factored in the algorithms.
  • Click Popularity: used lightly and in conjunction with analytics data to determine bounce rates. Since this is the least accurate measurement, it is believed to be used lightly in its importance for ranking. Click popularity is pulled mainly from the main search index and clicks are counted. Click popularity tends to me more of a theory and is definitely used in PPC campaigns to track and determine traffic patterns on sites.
  • Blog Popularity: this one is a “throw in” and is related to Google more than other engines, Google places more emphasis and trust in blogs, mainly because they have almost grown up on them and because they feel the human element is there. You can negotiate this till the cows come home, but there is more than enough evidence to say that Google favors blogs and blog links over the more traditional website. So, in summary, Blog Popularity is the ratio of incoming links solely from blogs – in theory.

How Web Search Engines Work

Search engines are the key to finding specific information on the vast expanse of the World Wide Web. Without sophisticated search engines, it would be virtually impossible to locate anything on the Web without knowing a specific URL. But do you know how search engines work? And do you know what makes some search engines more effective than others?

When people use the term search engine in relation to the Web, they are usually referring to the actual search forms that search through databases of HTML documents, initially gathered by a robot.

There are basically three types of search engines: those that are powered by robots (called crawlers, ants, or spiders), those that are powered by human submissions, and those that are a hybrid of the two.

Crawler-based search engines are those that use automated software agents (called crawlers) that visit a Web site, read the information on the actual site, read the site's meta tags and also follow the links that the site connects to performing indexing on all linked Web sites as well. The crawler returns all that information back to a central depository, where the data is indexed. The crawler will periodically return to the sites to check for any information that has changed. The frequency with which this happens is determined by the administrators of the search engine.


Human-powered search engines rely on humans to submit information that is subsequently indexed and catalogued. Only information that is submitted is put into the index.

Quick Search algorithm

Main features

  • simplification of the Boyer-Moore algorithm;
  • uses only the bad-character shift;
  • easy to implement;
  • preprocessing phase in O(m+sigma) time and O(sigma) space complexity;
  • searching phase in O(mn) time complexity;
  • very fast in practice for short patterns and large alphabets.

Description

The Quick Search algorithm uses only the bad-character shift table (see chapter Boyer-Moore algorithm). After an attempt where the window is positioned on the text factor y[j .. j+m-1], the length of the shift is at least equal to one. So, the character y[j+m] is necessarily involved in the next attempt, and thus can be used for the bad-character shift of the current attempt.

The bad-character shift of the present algorithm is slightly modified to take into account the last character of x as follows: for each character c in the alphabet Σ, qsBc[c] = min{ i : 0 ≤ i < m and x[m−1−i] = c } if c occurs in x, and m otherwise.

The preprocessing phase is in O(m+sigma) time and O(sigma) space complexity.

During the searching phase the comparisons between pattern and text characters during each attempt can be done in any order. The searching phase has a quadratic worst case time complexity but it has a good practical behaviour.

The C code

/* Build the Quick Search bad-character table for pattern x of length
   m: qsBc[c] is the shift to apply when character c follows the
   current window (m + 1 when c does not occur in x at all). */
void preQsBc(char *x, int m, int qsBc[]) {
   int k;

   /* A character absent from x permits the maximal shift. */
   for (k = 0; k < ASIZE; ++k)
      qsBc[k] = m + 1;
   /* Rightmost occurrences overwrite earlier (leftward) ones. */
   for (k = 0; k < m; ++k)
      qsBc[x[k]] = m - k;
}


/* Quick Search: report (via OUTPUT) every position j where pattern x
   of length m occurs in text y of length n.  Note the shift reads
   y[j + m], the character just past the window, so when j == n - m it
   inspects y[n] -- the text is expected to provide that byte. */
void QS(char *x, int m, char *y, int n) {
   int qsBc[ASIZE];
   int pos;

   /* Preprocessing */
   preQsBc(x, m, qsBc);

   /* Searching: whole-window memcmp, then bad-character shift. */
   for (pos = 0; pos + m <= n; pos += qsBc[y[pos + m]]) {
      if (memcmp(x, y + pos, m) == 0)
         OUTPUT(pos);
   }
}