<?

/**
 *           RiSearch PHP
 * 
 * web search engine, version 0.1b
 * (c) Sergej Tarasov, 2000-2002
 * 
 * Homepage: http://risearch.org/
 * email: risearch@risearch.org
 * Last modified: 11.11.2002
 */



print "Start indexing<BR>\n";

include "config.php";

# Counters shared with the indexing functions through `global`.
$cfn = 0;       # files indexed so far (incremented by index_file)
$cwn = 0;       # distinct words written to SITEWORDS
$kbcount = 0;   # running total of indexed document sizes, in kb

# index_file() accumulates the word list in the global $words. Start from an
# empty array so the loops below still work when no page could be indexed.
if (!isset($words) || !is_array($words)) {
    $words = array();
}

$fp_FINFO = fopen ("$FINFO", "w");
$fp_SITEWORDS = fopen ("$SITEWORDS", "wb");
$fp_WORD_IND = fopen ("$WORD_IND", "wb");
if ($fp_FINFO === FALSE || $fp_SITEWORDS === FALSE || $fp_WORD_IND === FALSE) {
    print "Error: cannot open index files for writing\n";
    exit(1);
}

# FINFO starts with a single newline, so the first record is written at offset 1.
fwrite($fp_FINFO, "\n");


$time1 = getmicrotime();

start_spidering();

$time2 = getmicrotime();
$time = $time2-$time1;
print "<BR>Scan took $time sec.<BR>";

# index_file() is only called from start_spidering(), so FINFO is complete now.
fclose($fp_FINFO);


print "Writing SITEWORDS\n";
    # Output is buffered in the two $to_print_* strings and flushed in chunks.
    # $pos_* hold the file offsets at the time of the last flush.
    $pos_sitewords = ftell($fp_SITEWORDS);
    $pos_word_ind  = ftell($fp_WORD_IND);
    $to_print_sitewords = "";
    $to_print_word_ind  = "";
    foreach($words as $word=>$value) {
        $cwn++;
        # Offsets at which this word's entries start in SITEWORDS and WORD_IND.
        $words_word_dum = pack("NN",$pos_sitewords+strlen($to_print_sitewords),
                                $pos_word_ind+strlen($to_print_word_ind));
        $to_print_sitewords .= "$word\x0A";
        # $value is a run of 4-byte FINFO offsets; store their count first.
        $to_print_word_ind .= pack("N",(int)(strlen($value)/4)).$value;
        # From here on $words maps each word to its packed offset pair,
        # which is what build_hash() stores in the hash buckets.
        $words[$word] = $words_word_dum;
        if (strlen($to_print_word_ind) > 32000) {
            fwrite($fp_SITEWORDS, $to_print_sitewords);
            fwrite($fp_WORD_IND, $to_print_word_ind);
            $to_print_sitewords = "";
            $to_print_word_ind  = "";
            $pos_sitewords = ftell($fp_SITEWORDS);
            $pos_word_ind  = ftell($fp_WORD_IND);
        }

    }
    fwrite($fp_SITEWORDS, $to_print_sitewords);
    fwrite($fp_WORD_IND, $to_print_word_ind);
fclose($fp_SITEWORDS);
fclose($fp_WORD_IND);

print "Build hash\n";

build_hash();

print "$cfn files are indexed\n";


#=====================================================================

# Crawl the site starting from the configured $start_url list.
#
# Keeps a queue of URLs still to fetch ($to_visit) and a set of URLs already
# fetched ($visited). Every page that can be opened is scanned for links,
# the links accepted by check_url() are queued, and the page text is handed
# to index_file(). Takes no arguments and returns nothing.
function start_spidering() {

    global $start_url;

    $to_visit = array();
    foreach ($start_url as $v) {
        $to_visit[$v] = 1;
    }
    $visited = array();

    while (count($to_visit) > 0) {

        # Always take the first queued URL. reset()/key() replace each(),
        # which no longer exists in PHP 8.
        reset($to_visit);
        $url = key($to_visit);
        unset($to_visit[$url]);
        $visited[$url] = 1;

        print "Url: $url\n";

        $fp = fopen($url,"r");
        if ( $fp === FALSE ) {
            # Nothing was fetched, so there is nothing to scan or index.
            print "Error in opening file: $url\n";
            continue;
        }

        $text = "";
        while (!feof ($fp)) {
            $chunk = fgets($fp, 4096);
            if ($chunk === FALSE) { break; }    # read error or end of stream
            $text .= $chunk;
        }
        fclose($fp);
        print "$url - ".strlen($text)." bytes\n";

        # Relative links resolve against <base href> when the page has one.
        $base = $url;
        if (preg_match_all("/<base\\s+href=([\"']?)([^\\s\"'>]+)\\1/is", $text, $matches,PREG_SET_ORDER)) {
            $base = $matches[0][2];
        }

        $links = get_link($text);
        foreach ($links as $k => $v) {
            $new_link = get_absolute_url($base,$k);
            $new_link = preg_replace("/#.*/","",$new_link);
            # The filters are applied to the URL without its query string,
            # but the full URL (query included) is what gets queued.
            $new_link_stripped = preg_replace("/\?.*/","",$new_link);
            if ( check_url($new_link_stripped)) {
                if ( ! array_key_exists($new_link,$visited)) {
                    $to_visit[$new_link] = 1;
                }
            }
        }

        index_file($text,$url);
    }

}
#=====================================================================

# Index one fetched document.
#
# $html_text - raw page content
# $url       - address the page was fetched from
#
# Appends one "url::size::title::description" line to the FINFO file and
# appends the packed 4-byte offset of that line to the global $words entry
# of every word found in the page. Words shorter than $min_length and words
# listed in $stop_words_array are skipped. Returns nothing.
function index_file($html_text,$url) {

    global $cfn, $kbcount, $descr_size, $min_length, $stop_words_array, $use_esc;
    global $use_selective_indexing, $no_index_strings;
    global $use_META, $use_META_descr;
    global $fp_FINFO;
    global $words;
    global $numbers;


    $cfn++;
    $size = strlen($html_text);
    $kbcount += intval($size/1024);
    print "$cfn -> $url; totalsize -> $kbcount kb<BR>\n";


    # Delete parts of document, which should not be indexed
    if ($use_selective_indexing == "YES") {
        foreach ($no_index_strings as $k => $v) {
            $html_text = preg_replace("/$k.*?$v/s"," ",$html_text);
        }
    }


    # The title may span several lines; collapse its whitespace so that the
    # FINFO record stays on a single line.
    $title = "";
    if (preg_match("/<title>\s*(.*?)\s*<\/title>/is",$html_text,$matches)) {
        $title = preg_replace("/\s+/"," ",$matches[1]);
    }
    if ($title == "") { $title = "No title"; }

    $keywords = "";
    $description = "";
    if ($use_META == "YES") {
        $res = get_META_info($html_text);
        $keywords = $res[0];
        $description = $res[1];
    }

    # Strip comments, scripts, style sheets and all remaining tags.
    $html_text = preg_replace("/<!--.*?-->/s"," ",$html_text);
    $html_text = preg_replace("/<[Ss][Cc][Rr][Ii][Pp][Tt].*?<\/[Ss][Cc][Rr][Ii][Pp][Tt]>/s"," ",$html_text);
    $html_text = preg_replace("/<[Ss][Tt][Yy][Ll][Ee].*?<\/[Ss][Tt][Yy][Ll][Ee]>/s"," ",$html_text);
    $html_text = preg_replace("/<[^>]*>/s"," ",$html_text);
    if ($use_esc == "YES") { $html_text = preg_replace_callback("/&[a-zA-Z0-9#]*?;/", 'esc2char', $html_text); }

    if (($use_META_descr == "YES") && ($description != "")) {
        # A META description can contain line breaks, which would split the record.
        $descript = substr(preg_replace("/\s+/"," ",$description),0,$descr_size);
    } else {
        $html_text = preg_replace("/\s+/s"," ",$html_text);
        $descript = substr($html_text,0,$descr_size);
    }

    $html_text = $html_text." ".$keywords." ".$description;

    # NOTE(review): this class previously read "a-zA-Z--$numbers". With an
    # empty $numbers the second hyphen forms a reversed range ("-" to " "),
    # which PCRE rejects, and $numbers was never imported with `global`.
    # The high-byte range below assumes the two damaged ranges were the 8-bit
    # national letters \xC0-\xDF and \xE0-\xFF -- confirm against the
    # upstream RiSearch source.
    $html_text = preg_replace("/[^a-zA-Z\\xC0-\\xFF$numbers ]/"," ",$html_text);
    $html_text = preg_replace("/\s+/s"," ",$html_text);
    $html_text = strtolower($html_text);

    # Set of distinct words on this page (array keys used for de-duplication).
    $words_temp = array();
    foreach (explode(" ",$html_text) as $word) {
        $words_temp[$word] = 1;
    }


    # Offset of the record about to be written; this is what the word index stores.
    $pos = ftell($fp_FINFO);
    $pos = pack("N",$pos);
    fwrite($fp_FINFO, "$url::$size::$title::$descript\n");

    foreach($words_temp as $word => $val) {
        $word = (string)$word;      # all-digit words come back as integer keys
        if (strlen($word) < $min_length) { continue; }
        if (array_key_exists($word,$stop_words_array)) { continue; }
        if (isset($words[$word])) {
            $words[$word] .= $pos;
        } else {
            $words[$word] = $pos;
        }
    }

    unset($words_temp);

}
#=====================================================================

# Write the HASH and HASHWORDS index files from the global word list.
#
# Expects $words to map each word to its packed entry (the 8-byte "NN" pair
# produced while SITEWORDS/WORD_IND were written). Each word is hashed into
# one or more of $HASHSIZE buckets, depending on $INDEXING_SCHEME:
#   1 - the whole word is hashed once;
#   3 - every 4-character substring of the word is hashed;
#   any other value - only the first 4 characters are hashed.
# HASH receives one 4-byte offset per bucket (0 for an empty bucket);
# HASHWORDS receives, per non-empty bucket, an entry count followed by the
# entries. Takes no arguments and returns nothing.
function build_hash() {

    global $words;
    global $HASHSIZE, $INDEXING_SCHEME, $HASH, $HASHWORDS;

    if ($HASHSIZE < 1) {
        # The modulo below would divide by zero.
        print "Error: HASHSIZE must be a positive number\n";
        return;
    }

    $hash_array = array();
    for ($i=0; $i<$HASHSIZE; $i++) { $hash_array[$i] = ""; }

    if (is_array($words)) {
        foreach($words as $word=>$value) {
            $word = (string)$word;      # all-digit words come back as integer keys
            if ($INDEXING_SCHEME == 3) { $subbound = strlen($word)-3; }
            else { $subbound = 1; }
            if (strlen($word)==3) { $subbound = 1; }
            $substring_length = 4;
            if ($INDEXING_SCHEME == 1) { $substring_length = strlen($word); }

            for ($i=0; $i<$subbound; $i++) {
                $hash_value = abs(risearch_hash(substr($word,$i,$substring_length)) % $HASHSIZE);
                $hash_array[$hash_value] .= $value;
            }
        }
    }


    $fp_HASH = fopen ("$HASH", "wb");
    $fp_HASHWORDS = fopen ("$HASHWORDS", "wb");
    if ($fp_HASH === FALSE || $fp_HASHWORDS === FALSE) {
        print "Error: cannot open hash files for writing\n";
        if ($fp_HASH !== FALSE) { fclose($fp_HASH); }
        if ($fp_HASHWORDS !== FALSE) { fclose($fp_HASHWORDS); }
        return;
    }

    # HASHWORDS starts with four zero bytes, so offset 0 in HASH can stand
    # for "empty bucket".
    $zzz = pack("N", 0);
    fwrite($fp_HASHWORDS, $zzz);
    $pos_hashwords = ftell($fp_HASHWORDS);
    $to_print_hash = "";
    $to_print_hashwords = "";

    for ($i=0; $i<$HASHSIZE; $i++) {

        if ($hash_array[$i] == "") {
            $to_print_hash .= $zzz;
        } else {
            $to_print_hash .= pack("N",$pos_hashwords + strlen($to_print_hashwords));
            # Entries are 8 bytes each; store how many follow.
            $to_print_hashwords .= pack("N", (int)(strlen($hash_array[$i])/8)).$hash_array[$i];
        }
        # Flush the buffers in chunks and re-read the real file offset.
        if (strlen($to_print_hashwords) > 64000) {
            fwrite($fp_HASH,$to_print_hash);
            fwrite($fp_HASHWORDS,$to_print_hashwords);
            $to_print_hash = "";
            $to_print_hashwords = "";
            $pos_hashwords  = ftell($fp_HASHWORDS);
        }
    }
    fwrite($fp_HASH,$to_print_hash);
    fwrite($fp_HASHWORDS,$to_print_hashwords);

    fclose($fp_HASH);
    fclose($fp_HASHWORDS);

}
#=====================================================================

# Hash a string, byte by byte, to a non-negative integer.
#
# Each step shifts the running value left by four bits, adds the next byte,
# and folds the bits selected by the 0xF0000000 mask back into the low part
# before clearing them. An empty key hashes to 0.
#
# This was originally declared as hash(), a name PHP itself now defines, so
# declaring it again is a fatal error. The algorithm lives under its own name
# and the old name is only provided where it is still free.
function risearch_hash($key) {

    $key = (string)$key;
    $mask = 0xF0000000;
    $h = 0;

    $length = strlen($key);
    for ($i=0; $i<$length; $i++) {
        $h = ($h << 4) + ord($key[$i]);
        $g = $h & $mask;
        if ($g) { $h ^= $g >> 24; }
        $h &= ~$g;
    }

    return $h;

}

# Legacy entry point, kept for installations where PHP has no built-in hash().
if (!function_exists('hash')) {
    function hash($key) {
        return risearch_hash($key);
    }
}

#===================================================================

# Current time in seconds, with the sub-second fraction, as a float.
function getmicrotime(){
    # microtime() yields "<fraction> <seconds>"; add the two parts up.
    $parts = explode(" ",microtime());
    $fraction = (float)$parts[0];
    $seconds = (float)$parts[1];
    return ($fraction + $seconds);
}

#=====================================================================

# Collect the link targets found in a page.
#
# Looks at href of <a...>, src of <frame...> and href of <area...>, in that
# order. Returns an array whose keys are the raw (unresolved) targets and
# whose values are all 1, so every target appears once.
function get_link($text) {

    $patterns = array(
        "/<a[^>]+href=([\"']?)([^\\s\"'>]+)\\1/is",
        "/<frame[^>]+src=([\"']?)([^\\s\"'>]+)\\1/is",
        "/<area[^>]+href=([\"']?)([^\\s\"'>]+)\\1/is"
    );

    $found = array();
    foreach ($patterns as $pattern) {
        preg_match_all($pattern, $text, $hits, PREG_SET_ORDER);
        foreach ($hits as $hit) {
            # group 2 is the target without its optional quotes
            $found[$hit[2]] = 1;
        }
    }

    return $found;
}

#=====================================================================

# Resolve a link found in a page against the page's base URL.
#
# $base - absolute URL of the page (or its <base href>)
# $url  - link target as written in the page
#
# Returns $url unchanged when it already carries a scheme. Otherwise returns
# an absolute URL built from the scheme, credentials, host and port of $base
# (scheme and host lower-cased) plus the resolved path:
#   "//host/x"    -> the scheme of $base is prepended
#   "", "#frag"   -> the page itself, including its query string
#   "?query"      -> the page itself with the new query string
#   "/x"          -> relative to the site root
#   "./x", "../x" -> relative to the directory of $base
#   "x"           -> relative to the directory of $base
function get_absolute_url($base,$url) {

    $url = (string)$url;
    $url_arr = parse_url($url);
    if (!is_array($url_arr)) { $url_arr = array(); }    # parse_url() gives FALSE on malformed input
    if (isset($url_arr["scheme"])) {
        return($url);
    }

    $base_arr = parse_url($base);
    if (!is_array($base_arr)) { $base_arr = array(); }
    $scheme = isset($base_arr["scheme"]) ? strtolower($base_arr["scheme"]) : "";

    # A reference with a host but no scheme ("//host/path") only borrows the scheme.
    if (isset($url_arr["host"])) {
        return $scheme.":".$url;
    }

    $base_base = $scheme."://";
    if (isset($base_arr["user"])) {
        $base_base .= $base_arr["user"];
        if (isset($base_arr["pass"])) {
            $base_base .= ":".$base_arr["pass"];
        }
        $base_base .= "@";
    }
    if (isset($base_arr["host"])) {
        $base_base .= strtolower($base_arr["host"]);
    }
    if (isset($base_arr["port"])) {
        $base_base .= ":".$base_arr["port"];
    }

    # $base_file is the full path of the base page, $base_path its directory.
    $base_file = isset($base_arr["path"]) ? $base_arr["path"] : "";
    if ($base_file == "") { $base_file = "/"; }
    $base_path = preg_replace("/(.*\/).*/","\\1",$base_file);

    # Empty and fragment-only references point at the page itself.
    if ($url == "" || $url[0] == "#") {
        $query = isset($base_arr["query"]) ? "?".$base_arr["query"] : "";
        return $base_base.$base_file.$query.$url;
    }
    # Query-only references keep the page and replace its query string.
    if ($url[0] == "?") {
        return $base_base.$base_file.$url;
    }

    if ($url[0] == "/") {
        return $base_base.$url;
    }

    while (substr($url,0,2) == "./") {
        $url = substr($url,2);
    }

    # Each leading "../" drops one directory; the root directory is never dropped.
    while (substr($url,0,3) == "../") {
        $url = substr($url,3);
        $base_path = preg_replace("/(.*\/).*\//","\\1",$base_path);
    }
    return $base_base.$base_path.$url;
}
#=====================================================================

# Decide whether a URL may be queued for crawling.
#
# A URL is accepted when it starts with "http://", matches $file_ext,
# matches neither $no_index_files nor $no_index_dir, and matches at least
# one of the patterns in $allow_url. The configured patterns are matched
# case-insensitively. Returns TRUE when accepted, FALSE otherwise.
function check_url($url) {

    global $file_ext, $no_index_files, $no_index_dir, $allow_url;

    # Any single failing filter rejects the URL.
    if ( ! preg_match("'^http://'",$url)
         || ! preg_match ("'$file_ext'i", $url)
         || preg_match ("'$no_index_files'i", $url)
         || preg_match ("'$no_index_dir'i", $url)) {
        return FALSE;
    }

    # The first matching allow pattern is enough.
    foreach ($allow_url as $allowed_pattern) {
        if ( preg_match("'$allowed_pattern'i", $url)) {
            return TRUE;
        }
    }

    return FALSE;
}
#=====================================================================

# Extract the META keywords and description of a page.
#
# Returns a two-element array: [0] is the content of
# <meta name="keywords" ...>, [1] the content of
# <meta name="description" ...>. A tag that is not present yields "".
# Tag and attribute names are matched case-insensitively, the quotes are
# optional, and an XHTML-style " />" ending is accepted.
function get_META_info($html) {

    $res = array();
    foreach (array("keywords", "description") as $i => $name) {
        $pattern = "/<meta\s*name=\"?".$name."\"?\s*content=\"?([^\"]*)\"?\s*\/?>/i";
        if (preg_match($pattern,$html,$matches)) {
            $res[$i] = $matches[1];
        } else {
            $res[$i] = "";
        }
    }

    return $res;
}
#=====================================================================



?>