8 Eylül 2007 Cumartesi

Reference and Explanation

There is an excellent article on "Fast String Searching" by John W. Ross in the "C/C++ Users Journal" of July 1995. A paper copy of the article will be put on library reserve for photocopying.

The code provided in the article has been typed in and is available through the links below. You can easily see how the routines work and make use of these programs.

Links to source code files

Main program for timing and testing

#include 
#include
#include
#include
#define MB 400000
#define MP 256
#define NCOPY 3
char *txtbuf;
int xxxFind(int n, char *txt, int m, char *pat);
main(int argc, char **argv)
{
char patbuf[MP];
FILE *in;
int i, m, n, t0, t1;
int nmatch = 0;
if (argc < 3) {
fprintf(stderr, "usage: %s pattern file\n", argv[0]);
exit(0);
}
if ((in = fopen(argv[2], "r")) == NULL) {
fprintf(stderr, "cannot open input file: %s\n", argv[2]);
exit(0);
}
strcpy(patbuf, argv[1]);
m = strlen(patbuf);
txtbuf = malloc(NCOPY * MB);
n = 0;
for (i = 0; i < NCOPY; i++) {
n += fread(txtbuf+n, sizeof(char), MB, in);
rewind(in);
}
close(in);
t0 = clock();
nmatch = xxxFind(n, txtbuf, m, patbuf);
t1 = clock() - t0;
printf("%d matches took %.3f\n", nmatch, (float)t1/CLOCKS_PER_SEC);
}
/* End of File */

Brute force string searching

#include 
int bfFind(int n, char *txt, int m, char *pat)
{
int i, j, k, lim;
int nmatch = 0;
lim = n - m + 1;
i = 0;
while (i < lim) {
k = i;
for (j = 0; j < m && txt[k] == pat[j]; j++)
k++;
if (j == m) {
nmatch++;
printf("%d \n", i);
i += m;
}
else
i++;
}
return(nmatch);
}
/* End of File */

Improve by first character search

#include 
#include
int fcFind(int n, char *txt, int m, char *pat)
{
int i, j, k;
int nmatch = 0;
char *cp;
cp = strchr(txt, pat[0]);
while (cp) {
i = cp - txt;
k = i;
for (j = 0; j < m && txt[k] == pat[j]; j++)
k ++;
if (j == m) {
nmatch++;
printf("%d \n", i);
cp = strchr(txt + i + m, pat[0]);
}
else
cp = strchr(txt + i + 1, pat[0]);
}
return(nmatch);
}
/* End of File */

Call C library string match function

#include 
#include
int ssFind(int n, char *txt, int m, char *pat)
{
int nmatch = 0;
char *cp;
cp = txt;
while (cp = strstr(cp, pat)) {
printf("%d\n", cp-txt);
nmatch++;
cp++;
}
return(nmatch);
}
/* End of File */

Improve by low frequency character search

#include 
#include
int fclFind(int n, char *txt, int m, char *pat, int pos)
{
int i, j, k;
int nmatch = 0;
char *cp;
cp = strchr(txt, pat[pos]);
while (cp) {
i = cp - txt;
k = i - pos;
for (j = 0; j < m && txt[k] == pat[j]; j++)
k++;
if (j == m) {
nmatch++;
printf("%d \n", i - pos);
cp = strchr(txt + i + m, pat[pos]);
}
else
cp = strchr(txt + i + 1, pat[pos]);
}
return(nmatch);
}
/* End of File */

Boyer-Moore-Horspool function

#include 
#define ALPHABETSZ 128
#define PATGRD 0xfe
#define TXTGRD 0xff
static int d[ALPHABETSZ];
void bmhInit(char *txt, int m, char *pat)
{
int i;
/* initialize */
pat[0] = PATGRD;
txt[0] = TXTGRD;
for (i = 0; i < ALPHABETSZ; i ++)
d[i] = m;
for (i = 1; i < m; i++)
d[pat[i]] = m - 1;
}

int bmhFind(int n, char *txt, int m, char *pat)
{
int i, j, k;
int nmatch = 0;
i = m;
while (i <= n) {
k = i;
for (j = m; txt[k] == pat[j]; j--)
k--;
if (j == 0) {
nmatch++;
printf("%d \n", k);
}
i += d[txt[i]];
}
return(nmatch);
}
/* End of File */

Main program for Boyer-Moore-Horspool function

#include 
#include
#include
#include
#define MB 400000
#define MP 256
#define NCOPY 3
char *txtbuf;
int bmhInit(char *txt, int m, char *pat);
int bmhFind(int n, char *txt, int m, char *pat);
main(int argc, char **argv)
{
char patbuf[MP];
FILE *in;
int i, m, n, t0, t1;
int nmatch = 0;
char *txtp, *patp = patbuf + 1;
if (argc < 3) {
fprintf(stderr, "usage: %s pattern file\n", argv[0]);
exit(0);
}
if ((in = fopen(argv[2], "r")) == NULL) {
fprintf(stderr, "cannot open input file: %s\n", argv[2]);
exit(0);
}
strcpy(patp, argv[1]);
m = strlen(patp);
txtbuf = malloc(NCOPY * MB);
txtp = txtbuf + 1;
n = 0;
for (i = 0; i < NCOPY; i++) {
n += fread(txtp + n, sizeof(char), MB, in);
rewind (in);
}
close (in);
bmhInit(txtbuf, m, patbuf);
t0 = clock();
nmatch = bmhFind(n, txtbuf, m, patbuf);
t1 = clock() - t0;
printf("%d matches took %.3f\n", nmatch, (float)t1/CLOCKS_PER_SEC);
}
/* End of File */

5 Types of Popularity for Ranking

  • Link popularity: simply links coming into any website from any and all sources. This is a broad term as other popularity types will fall under the effect of Link Popularity because the link is the method.
  • Industry Popularity: the relationship of a site in its known industry to the industry. Are the big industry sites pointing to the site? There are always a few blogs in every industry that stand out, and become celebrity blogs. Matt Cutts, Graywolf, Shoemoney, these are celebrity blogs they gain high industry traffic and link out to their industry. The “BlogRoll” becomes the popularity factor and passes far more power than any directory. Industry popularity focuses on a sites’ prominence in the industry, it can be effective, but it tends to make a group popular and misses the larger core of the industry, so this popularity needs to be factored appropriately. Problems with Industry popularity include blogroll spam, now very common and a potential cause for future devaluation of blogrolls.
  • Social Popularity: very similar to Industry popularity, except here it is values passed from social sites like digg and del.icio.us. Multiple instances of a site on a social site are factored in the algorithms.
  • Click Popularity: used lightly and in conjunction with analytics data to determine bounce rates. Since this is the least accurate measurement, it is believed to be used lightly in its importance for ranking. Click popularity is pulled mainly from the main search index and clicks are counted. Click popularity tends to me more of a theory and is definitely used in PPC campaigns to track and determine traffic patterns on sites.
  • Blog Popularity: this one is a “throw in” and is related to Google more than other engines, Google places more emphasis and trust in blogs, mainly because they have almost grown up on them and because they feel the human element is there. You can negotiate this till the cows come home, but there is more than enough evidence to say that Google favors blogs and blog links over the more traditional website. So, in summary, Blog Popularity is the ratio of incoming links solely from blogs – in theory.

How Web Search Engines Work

Search engines are the key to finding specific information on the vast expanse of the World Wide Web. Without sophisticated search engines, it would be virtually impossible to locate anything on the Web without knowing a specific URL. But do you know how search engines work? And do you know what makes some search engines more effective than others?

When people use the term search engine in relation to the Web, they are usually referring to the actual search forms that searches through databases of HTML documents, initially gathered by a robot.

There are basically three types of search engines: Those that are powered by robots (called crawlers; ants or spiders) and those that are powered by human submissions; and those that are a hybrid of the two.

Crawler-based search engines are those that use automated software agents (called crawlers) that visit a Web site, read the information on the actual site, read the site's meta tags and also follow the links that the site connects to performing indexing on all linked Web sites as well. The crawler returns all that information back to a central depository, where the data is indexed. The crawler will periodically return to the sites to check for any information that has changed. The frequency with which this happens is determined by the administrators of the search engine.


Human-powered search engines rely on humans to submit information that is subsequently indexed and catalogued. Only information that is submitted is put into the index.

Quick Search algorithm

Main features

  • simplification of the Boyer-Moore algorithm;
  • uses only the bad-character shift;
  • easy to implement;
  • preprocessing phase in O(m+sigma) time and O(sigma) space complexity;
  • searching phase in O(mn) time complexity;
  • very fast in practice for short patterns and large alphabets.

Description

The Quick Search algorithm uses only the bad-character shift table (see chapter Boyer-Moore algorithm). After an attempt where the window is positioned on the text factor y[j .. j+m-1], the length of the shift is at least equal to one. So, the character y[j+m] is necessarily involved in the next attempt, and thus can be used for the bad-character shift of the current attempt.

The bad-character shift of the present algorithm is slightly modified to take into account the last character of x as follows: for c in Sigma, qsBc[c]=min{i : 0 leq i < m and x[m-1-i]=c} if c occurs in x, m otherwise.

The preprocessing phase is in O(m+sigma) time and O(sigma) space complexity.

During the searching phase the comparisons between pattern and text characters during each attempt can be done in any order. The searching phase has a quadratic worst case time complexity but it has a good practical behaviour.

The C code

void preQsBc(char *x, int m, int qsBc[]) {
int i;

for (i = 0; i < ASIZE; ++i)
qsBc[i] = m + 1;
for (i = 0; i < m; ++i)
qsBc[x[i]] = m - i;
}


void QS(char *x, int m, char *y, int n) {
int j, qsBc[ASIZE];

/* Preprocessing */
preQsBc(x, m, qsBc);

/* Searching */
j = 0;
while (j <= n - m) {
if (memcmp(x, y + j, m) == 0)
OUTPUT(j);
j += qsBc[y[j + m]]; /* shift */
}
}

24 Ağustos 2007 Cuma

Search algorithm

In computer science, a search algorithm to apply knowledge about the structure of the , broadly speaking, is an algorithm that takes a problem as input and returns a solution to the problem, usually after evaluating a number of possible solutions. Most of the algorithms studied by computer scientists that solve problems are kinds of search algorithms. The set of all possible solutions to a problem is called the search space. Brute-force search or "naïve"/uninformed search algorithms use the simplest, most intuitive method of searching through the search space, whereas informed search algorithms use heuristicssearch space to try to reduce the amount of time spent searching

Uninformed search

An uninformed search algorithm is one that does not take into account the specific nature of the problem. As such, they can be implemented in general, and then the same implementation can be used in a wide range of problems thanks to abstraction. The drawback is that most search spaces are extremely large, and an uninformed search (especially of a tree) will take a reasonable amount of time only for small examples. As such, to speed up the process, sometimes only an informed search will do.

List search

List search algorithms are perhaps the most basic kind of search algorithm. The goal is to find one element of a set by some key (perhaps containing other information related to the key). As this is a common problem in computer science, the computational complexity of these algorithms has been well studied. The simplest such algorithm is linear search, which simply examines each element of the list in order. It has expensive O(n) running time, where n is the number of items in the list, but can be used directly on any unprocessed list. A more sophisticated list search algorithm is binary search; it runs in O(log n) time. This is significantly better than linear search for large lists of data, but it requires that the list be sorted before searching (see sorting algorithm) and also be random access. Interpolation search is better than binary search for large sorted lists with fairly even distributions, but has a worst-case running time of O(n).

Grover's algorithm is a quantum algorithm that offers quadratic speedup over the classical linear search for unsorted lists. However, it requires a currently non-existent quantum computer on which to run.

Hash tables are also used for list search, requiring only constant time for search in the average case, but more space overhead and terrible O(n) worst-case search time. Another search based on specialized data structures uses self-balancing binary search trees and requires O(log n) time to search; these can be seen as extending the main ideas of binary search to allow fast insertion and removal. See associative array for more discussion of list search data structures.

Most list search algorithms, such as linear search, binary search, and self-balancing binary search trees, can be extended with little additional cost to find all values less than or greater than a given key, an operation called range search. The glaring exception is hash tables, which cannot perform such a search efficiently.

Tree search

Tree search algorithms are the heart of searching techniques. These search trees of nodes, whether that tree is explicit or implicit (generated on the go). The basic principle is that a node is taken from a data structure, its successors examined and added to the data structure. By manipulating the data structure, the tree is explored in different orders for instance level by level (Breadth-first search) or reaching a leaf node first and backtracking (Depth-first search). Other examples of tree-searches include Iterative-deepening search, Depth-limited search, Bidirectional search and Uniform-cost search.

Graph search

Many of the problems in graph theory can be solved using search algorithms, such as Dijkstra's algorithm, Kruskal's algorithm, the nearest neighbour algorithm, and Prim's algorithm. These can be seen as extensions of the tree-search algorithms

SQL search

Many of the problems in Tree search can be solved using SQL type searches. SQL typically works best on Structured data. It offers one advantage over hierarchical type search in that it allows access to the data in many different ways. In a Hierarchical search your path is forced by the branches of the tree (example name by alphabetical order) while with SQL you have the flexibility of accessing the data along multiple directions (name, address, income etc...).

Tradeoff Based search

While SQL search offers great flexibility to search the data, it still operates like a computer does: by constraints. In SQL, constraints are used to eliminate the data, while tradeoff based search uses a more "human" metaphor. For example if you are searching for a car in a dataset, your SQL statement looks like: select car from dataset where price < $30,000, and Consumption > 30MPG, and Color = 'RED'. While a tradeoff type query would look like "I like the red car, but I will settle for the blue if it is $2,000 cheaper".

Informed search

In an informed search, a heuristic that is specific to the problem is used as a guide. A good heuristic will make an informed search dramatically out-perform any uninformed search.

There are few prominent informed list-search algorithms. A possible member of that category is a hash table with a hashing function that is a heuristic based on the problem at hand. Most informed search algorithms explore trees. Like the uninformed algorithms, they can be extended to work for graphs as well.

Adversarial search

In games such as chess, there is a game tree of all possible moves by both players and the resulting board configurations, and we can search this tree to find an effective playing strategy. This type of problem has the unique characteristic that we must account for any possible move our opponent might make. To account for this, game-playing computer programs, as well as other forms of artificial intelligence like machine planning, often use search algorithms like the minimax algorithm, search tree pruning, and alpha-beta pruning.

Constraint satisfaction

This is a type of search which solves constraint satisfaction problems where, rather than looking for a path, the solution is simply a set of values assigned to a set of variables. Because the variables can be processed in any order, the usual tree search algorithms are too inefficient. Methods of solving constraint problems include combinatorial search and backtracking, both of which take advantage of the freedom associated with constraint problems

Search engine

A search engine is an information retrieval system designed to help find information stored on a computer system. Search engines help to minimize the time required to find information and the amount of information which must be consulted, akin to other techniques for managing information overload.

The most popular form of a search engine is a Web search engine which searches for information on the public World Wide Web. Other kinds of search engines include enterprise search engines, which search on intranets, personal search engines, and mobile search engines

How search engines work

  • Querying
  • Ranking
  • Indexing
  • Natural Language Search

Querying

Search engines provide an interface to a group of items that enables users to specify criteria about an item of interest and have the engine find the matching items within the group.

In the most popular form of search, items are documents or web pages and the criteria are words or concepts that the documents may contain.

Ranking

A Boolean search for an item within a group of items will either return the exact matching item or nothing. This is a rather orthodox search method where the equality between the desired item and the actual item must be exact. In application, it is sometimes far more beneficial and useful to incorporate a more lax measure of similarity between the desired item(s) and the items that exist in the group being searched.

For example, instead of finding only the exact book in a library, a library search engine may return a list of 'similar' books, with the exact book listed first.

The list of items that meet the criteria specified by the query are typically sorted, or ranked, in some regard so as to place the most 'relevant' items first. Placing the most relevant items first reduces the time required by users to determine whether one or more of the resulting items are sufficiently similar to the query. It has become common knowledge through the use of Web search engines that the further down the list of matching items you browse, the less relevant the items become.

Indexing

To provide a set of matching items quickly, a search engine will typically collect information, or metadata, about the group of items under consideration beforehand. For example, a library search engine may determine the author of each book automatically and add the author name to a description of each book. Users can then search for books by the author's name. Other metadata in this example might include the book title, the number of pages in the book, the date it was published, and so forth.

The metadata collected about each item is typically stored on a computer in the form of an index. The index typically requires a smaller amount of computer storage and provides a way for the search engine to calculate the relevance, or similarity, between the query and the set of items.

Natural Language Search

Natural Language Search is the term used to describe web search engines that apply natural language processing of some form. Traditional search engines tend to use a non-linguistic model of language and the hypothesis is that NLS will provide better results - that is to say, results that more accurately and efficiently support a user's need.

Web directory

A web directory or link directory is a directory on the World Wide Web. It specializes in linking to other web sites and categorizing those links.

A web directory is not a search engine, and does not display lists of web pages based on keywords, instead it lists web sites by category and subcategory. The categorization is usually based on the whole web site, rather than one page or a set of keywords, and sites are often limited to inclusion in only one or two categories. Web directories often allow site owners to directly submit their site for inclusion, and have editors review submissions for fitness.

RSS directories are similar to web directories, but contain collections of RSS feeds, instead of links to web sites.

General

Some directories are very general in scope and list websites across a wide range of categories, regions and languages. But there are also a large number of niche directories, which focus on restricted regions, single languages, or specialist sectors.

Examples of well known, general, web directories are Yahoo! Directory and the Open Directory Project (ODP). ODP is significant due to its extensive categorization and large number of listings and its free availability for use by other directories and search engines (many sites violate its terms of use by using its content without acknowledgement).

A debate over the quality of directories and databases continues, as search engines use ODP's content without real integration, and some experiment using clustering. There have been many attempts to make directory development easier, whether using a "links for all" type link submission site using a script, or any number of available PHP portals and programs. Recently, social software techniques have spawned new efforts of categorization, with Amazon.com adding tagging to their product pages.

Directories have various types of listings, often dependent upon the price paid for inclusion:

  • Free Submission - there is no charge for review of the site
  • Reciprocal Link - the site submitted must link back to the directory in order to be listed
  • Paid Submissions - a fee is charged for reviewing the submitted link
  • No Follow - there is a rel="nofollow" attribute associated with the link, meaning search engines will not follow the link.
  • Featured Link - the link is given a premium position in the category where it is submitted
  • Featured Homepage Link - the link may be listed on the homepage of the directory.
  • Bid for Position - a recent innovation (2007) where sites are ordered based on bids

Human edited directories

Human-edited directories are often targeted by SEOs on the basis that links from reputable sources will improve rankings in the major search engines. Some directories may prevent search engines from rating a displayed link by using redirects, nofollow attributes, or other techniques.

Many human-edited directories, including the Open Directory Project and the World Wide Web Virtual Library, are edited by volunteers, who are often experts in particular categories. These directories are sometimes criticized due to long delays in approving submissions, or for rigid organizational structures and disputes among volunteer editors.

In response to these criticisms, some volunteer-edited directories have adopted wiki technology, to allow broader community participation in editing the directory (at the risk of introducing lower-quality, less objective entries).

Another direction taken by some web directories is the paid for inclusion model. This method enables the directory to offer timely inclusion for submissions and generally fewer listings as a result of the paid model. They often offer additional listing options to further enhance listings, including features listings and additional links to inner pages of the listed web site. These options typically have an additional fee associated, but offer significant help and visibility to sites and/or their inside pages.

Today submission of websites to web directories is considered as a common SEO (search engine optimization) technique to get vital back-links for the submitted web site. One distinctive feature of 'directory submission' is that it can not be fully automated like search engine submissions. Manual directory submission is a tedious and time consuming job and is often outsourced by the webmasters.