/* OpenWebSpider
*
 *  Authors:     Stefano Alimonti AND Stefano Fantin
 *  Version:     0.8
 *  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it
*
*
* This file is part of OpenWebSpider
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
*/

#ifndef __INDEXER
#define __INDEXER

extern int gDb2FatalError;

#ifndef OWS_MAX_CACHE_SOURCE_BYTES
#define OWS_MAX_CACHE_SOURCE_BYTES 100000
#endif

/*
 * Tells whether the last error recorded on `db` looks like a dropped
 * connection rather than a problem with the statement itself.
 *
 * Returns 1 when the client error number is 2006, 2013 or 2055, or when the
 * error text contains one of the well-known "connection lost" phrases;
 * returns 0 otherwise, including when `db` is NULL.
 */
static int IsDb2TransientDisconnect(MYSQL *db)
{
    const char *lastMessage;

    if (db == NULL)
        return 0;

    /* Client-side error numbers that signal a lost/closed connection. */
    switch (mysql_errno(db))
    {
        case 2006:
        case 2013:
        case 2055:
            return 1;
        default:
            break;
    }

    /* Fall back to matching the message text. */
    lastMessage = mysql_error(db);
    if (lastMessage == NULL)
        return 0;

    if (strstr(lastMessage, "Server has gone away") != NULL)
        return 1;

    return (strstr(lastMessage, "Lost connection to MySQL server") != NULL) ? 1 : 0;
}

/*
 * Runs `sqlQuery` on the DB2 connection and, if it fails with what looks
 * like a dropped connection, pings the server, re-issues SET NAMES and
 * tries the statement exactly once more.
 *
 * Returns 0 on success (first attempt or retry); otherwise the non-zero
 * status of the last attempt that was made.
 */
static int QueryDb2WithReconnectRetry(char *sqlQuery)
{
    int status = my_mysql_query(&gMysqlDB2, sqlQuery, BLOCKINDEX);

    if (status != 0 && IsDb2TransientDisconnect(&gMysqlDB2))
    {
        fprintf(stderr,
                "\n[WARN] DB2 transient disconnect: retry in corso (%s)\n",
                mysql_error(&gMysqlDB2));

        /* Results of the ping and of SET NAMES are intentionally not checked;
           the retry below decides the outcome. */
        my_mysql_ping(&gMysqlDB2, BLOCKINDEX);
        my_mysql_query(&gMysqlDB2, "SET NAMES utf8mb4 COLLATE utf8mb4_unicode_ci", BLOCKINDEX);

        status = my_mysql_query(&gMysqlDB2, sqlQuery, BLOCKINDEX);
        if (status == 0)
            fprintf(stderr, "\n[WARN] DB2 retry riuscito: indicizzazione continua.\n");
    }

    return status;
}

/*
 * Returns the total byte length (2, 3 or 4) of the well-formed UTF-8
 * sequence starting at p, or 0 when the bytes at p do not form one.
 *
 * Rejected: stray continuation bytes, lead bytes 0xC0/0xC1/0xF5..0xFF,
 * truncated sequences, overlong encodings, UTF-16 surrogates and code
 * points above U+10FFFF. Continuation bytes are inspected strictly left to
 * right, so a terminating NUL stops the scan before anything past it is read.
 */
static size_t Utf8FallbackSeqLen(const unsigned char *p)
{
    unsigned int codePoint;
    size_t tail;
    size_t k;

    if (p[0] >= 0xC2 && p[0] <= 0xDF) {
        tail = 1;
        codePoint = (unsigned int)(p[0] & 0x1F);
    } else if (p[0] >= 0xE0 && p[0] <= 0xEF) {
        tail = 2;
        codePoint = (unsigned int)(p[0] & 0x0F);
    } else if (p[0] >= 0xF0 && p[0] <= 0xF4) {
        tail = 3;
        codePoint = (unsigned int)(p[0] & 0x07);
    } else {
        return 0;
    }

    for (k = 1; k <= tail; k++) {
        if ((p[k] & 0xC0) != 0x80)
            return 0;
        codePoint = (codePoint << 6) | (unsigned int)(p[k] & 0x3F);
    }

    switch (tail) {
    case 1:
        if (codePoint < 0x80)
            return 0;
        break;
    case 2:
        if (codePoint < 0x800 || (codePoint >= 0xD800 && codePoint <= 0xDFFF))
            return 0;
        break;
    default:
        if (codePoint < 0x10000 || codePoint > 0x10FFFF)
            return 0;
        break;
    }

    return tail + 1;
}

/*
 * Writes a sanitized copy of `in` into `out` (capacity `outSize`, always
 * NUL-terminated when outSize > 0; nothing is written when outSize == 0).
 *
 *  - ASCII 0x20..0x7F is copied unchanged.
 *  - ASCII control bytes (including CR, LF and TAB) become a single space.
 *  - Well-formed multi-byte UTF-8 sequences are copied unchanged.
 *  - Every byte that does not start a well-formed sequence becomes a single
 *    space, and scanning resumes at the very next byte.
 *
 * Copying stops at the end of the input, when the output is full, or when
 * the next multi-byte sequence would not fit in its entirety. A NULL `in`
 * produces an empty string.
 */
static void BuildUtf8SqlFallback(const char *in, char *out, size_t outSize)
{
    const unsigned char *src = (const unsigned char *)in;
    size_t written = 0;

    if (outSize == 0)
        return;

    if (src != NULL)
    {
        while (*src != '\0' && written < outSize - 1)
        {
            size_t seqLen;

            if (*src < 0x80)
            {
                /* printable ASCII is kept, control characters are blanked */
                out[written++] = (*src >= 0x20) ? (char)*src : ' ';
                src++;
                continue;
            }

            seqLen = Utf8FallbackSeqLen(src);
            if (seqLen == 0)
            {
                out[written++] = ' ';
                src++;
                continue;
            }

            /* never emit a partial sequence: it must fit together with the NUL */
            if (written + seqLen >= outSize)
                break;

            memcpy(out + written, src, seqLen);
            written += seqLen;
            src += seqLen;
        }
    }

    out[written] = '\0';
}

/*
 * Dumps a failing SQL statement to stderr, rate-limited so that a burst of
 * failures does not flood the log.
 *
 * Only the first call and every SQL_DEBUG_EVERY_N-th call print anything;
 * the calls in between are merely counted and reported on the next dump.
 * Statements longer than SQL_DEBUG_MAX_CHARS are cut at that length and
 * followed by their full length. A NULL statement is reported as such.
 *
 * The counters are function-local statics shared by all callers.
 */
static void LogDebugSqlThrottled(const char *sqlQuery)
{
    enum { SQL_DEBUG_EVERY_N = 20, SQL_DEBUG_MAX_CHARS = 1200 };
    static unsigned long callCount = 0;
    static unsigned long skippedSinceDump = 0;
    size_t queryLen;

    ++callCount;

    /* Throttled call: just remember that something was skipped. */
    if (callCount != 1 && (callCount % SQL_DEBUG_EVERY_N) != 0)
    {
        ++skippedSinceDump;
        return;
    }

    if (skippedSinceDump != 0)
    {
        fprintf(stderr,
                "[DEBUG SQL] throttle: %lu log SQL soppressi dall'ultimo dump\n",
                skippedSinceDump);
        skippedSinceDump = 0;
    }

    if (sqlQuery == NULL)
    {
        fprintf(stderr, "[DEBUG SQL] (query nulla)\n");
        return;
    }

    queryLen = strlen(sqlQuery);

    if (queryLen > SQL_DEBUG_MAX_CHARS)
    {
        /* the precision in the format must stay in sync with SQL_DEBUG_MAX_CHARS */
        fprintf(stderr,
                "[DEBUG SQL] %.1200s ... [troncata, len=%lu]\n\n",
                sqlQuery, (unsigned long)queryLen);
        return;
    }

    fprintf(stderr, "[DEBUG SQL] %s\n\n", sqlQuery);
}

/*
 * Reports a failed DB2 statement: prints the MySQL error number and text to
 * stdout, tagged with `context` ("query" when context is NULL), and then
 * hands the statement to the throttled SQL dump.
 */
static void LogDb2QueryFailure(const char *context, const char *sqlQuery)
{
    unsigned int code = mysql_errno(&gMysqlDB2);
    const char *detail = mysql_error(&gMysqlDB2);
    const char *label = "query";

    if (context != NULL)
        label = context;

    /* an absent or empty message is replaced with a visible placeholder */
    if (detail == NULL || *detail == '\0')
        detail = "<empty mysql_error>";

    printf("\r\nQuery Error in function IndexPage() [%s]: (%u) %s\r\n",
           label,
           code,
           detail);

    LogDebugSqlThrottled(sqlQuery);
}

/* DEFAULT: MySQL FULL-TEXT Index
 *
 * IndexPage
 *  Extracts the text of a downloaded document and stores it with an
 *  INSERT DELAYED into the table named by gTable; optionally stores the
 *  escaped raw HTML in the `cache` column afterwards.
 *
 *  html       - NUL-terminated document body (modified in place for
 *               host.type == 2, where RemoveShit() is applied to it)
 *  host       - page descriptor, passed by value; host.type selects the
 *               handling: 1 = HTML, 2 = plain text, 4 = handled by a module
 *  htmlLength - length of html; only forwarded to modFilter handlers and
 *               debug logs
 *
 *  Returns 1 when the page row was inserted, and also when bTesting or
 *  bDontIndexPages is set (nothing is written in that case).
 *  Returns 0 when the page is skipped or the insert fails: DB2 flagged as
 *  fatally broken, page already indexed in update mode, unknown host.type,
 *  content regex mismatch, a modFilter handler returning 0, or a query
 *  error.
 */

int IndexPage(char* html, struct sHost host, unsigned int htmlLength)
{
    /* Once DB2 has been flagged as fatally broken every page is skipped;
       the warning is printed only the first time. */
    if (gDb2FatalError)
    {
        static int warned = 0;

        if (!warned)
        {
            fprintf(stderr,
                    "\n[WARN] DB2 in errore fatale: IndexPage salta l'indicizzazione delle pagine.\n\n");
            warned = 1;
        }

        return 0;
    }

    char *cTmp;
    char *pureText;
    char tmpTitle[MAXDESCRIPTIONSIZE], title[MAXDESCRIPTIONSIZE];

    /* Larger buffers for SQL escaping: escaping can double the length,
       plus one byte for the terminating NUL. */
    char sanHostname[MAXHOSTSIZE * 2 + 1];
    char sanPage[MAXPAGESIZE * 2 + 1];
    char sanTitle[MAXDESCRIPTIONSIZE * 2 + 1];

    int usetitle = 0;
    char* sqlQuery;
    char *htmlcache = NULL;
    char *sanText = NULL;       /* page text escaped for SQL */
    unsigned int textLength = 0;

    int (*modFilter)(struct functArg*);

    if (bTesting == 1 || bDontIndexPages == 1)
        return 1;

    if (bUpdate == 1)    /* update mode (the original comment calls it "-u") */
    {
        /* Is this page Indexed ? */
        if (IsPageIndexed(&host) == 1)
            return 0;   /* Yes, don't re-index */
    }

    /* Both buffers are checked for NULL a few lines below, before first use. */
    cTmp     = (char*)malloc(MAXPACKETBUFSIZE);
    pureText = (char*)malloc(MAXPACKETBUFSIZE);

    /* Are we saving a cache? If so, prepare the escaped copy now, while
       html is still untouched. */
    htmlcache = NULL;
    if (xCacheHtml == 1)
    {
        size_t rawCacheLen = strlen(html);

        /* Only HTML pages not larger than OWS_MAX_CACHE_SOURCE_BYTES are cached. */
        if (host.type != 1 || rawCacheLen > OWS_MAX_CACHE_SOURCE_BYTES)
        {
            htmlcache = NULL;
        }
        else
        {
            /* 2*len + 2 bytes: enough for the worst-case escaped length plus NUL */
            htmlcache = (char*)malloc((rawCacheLen + 1) * 2);
        }
        /* A failed allocation here silently disables caching for this page. */
        if (htmlcache)
        {
            thrdBlock(BLOCKINDEX);
            mysql_real_escape_string(&gMysqlDB2, htmlcache, html, rawCacheLen);
            thrdUnBlock(BLOCKINDEX);
        }
    }

    if (cTmp == NULL || pureText == NULL)
        MemoryCorruptedHandler("IndexPage");

    if (host.type == 1)    /* HTML page */
    {
        if (BetweenTag(html, "title", tmpTitle, 1, MAXDESCRIPTIONSIZE) > 0)
        {
            memset(title, 0, MAXDESCRIPTIONSIZE);
            /* tmpTitle + 1 skips the first byte returned by BetweenTag()
               (NOTE(review): presumably a leftover delimiter - confirm) */
            snprintf(title, MAXDESCRIPTIONSIZE - 1, "%s", tmpTitle + 1);
            usetitle = 1;
        }

        /* Extract the plain text from the HTML */
        textLength = UnHtml(html, cTmp, MAXPACKETBUFSIZE);

        /* Clamp so that the NUL written below stays inside pureText. */
        if (textLength >= MAXPACKETBUFSIZE)
            textLength = MAXPACKETBUFSIZE - 1;

        memcpy(pureText, cTmp, textLength);
        pureText[textLength] = '\0';
    }
    else if (host.type == 2)   /* Plain text files */
    {
        RemoveShit(html);
        OnlyOneSpace(html, pureText, MAXPACKETBUFSIZE);
        textLength = strlen(pureText);
    }
    else if (host.type == 4)   /* Custom handled files */
    {
        /* we empty pureText so the module can fill it with the text of the page */
        memset(pureText, 0, MAXPACKETBUFSIZE);
        textLength = 0;
    }
    else
    {
        /* unknown host.type: nothing to index */
        FREE(cTmp);
        FREE(pureText);
        FREE(htmlcache);

        return 0;
    }

    /* Are we using a regular expression content filter?
       NOTE(review): for host.type == 4 pureText is still empty at this
       point, because the modules only run further below - confirm that
       filtering an empty string is intended for module-handled files. */
    if (bUseRegularExpressionB == 1)
    {   /* yes: a non-matching page is dropped */
        if (regexec(&regexContentFilter, pureText, 0, 0, 0) != 0)
        {
            FREE(cTmp);
            FREE(pureText);
            FREE(htmlcache);

            return 0;
        }
    }   /* else continue */

    /* Run every registered "modFilter" handler in order. A handler may
       rewrite pureText in place; returning 0 vetoes the page. */
    {
        unsigned int modFilterCount;
        unsigned int modFilterIndex;

        modFilterCount = GetModFunctionHandlerCountByName("modFilter");
        for (modFilterIndex = 0; modFilterIndex < modFilterCount; modFilterIndex++)
        {
            struct functArg tmpModArg;
            char dbg[1024];
            char *moduleFilename;

            modFilter = GetModFunctionHandlerByNameAt("modFilter", modFilterIndex);
            if (!modFilter)
                continue;
            /* NOTE(review): assumes the handler index equals the loaded-module
               index; the name is only used for debug logging - confirm. */
            moduleFilename = GetLoadedModuleFilenameByIndex(modFilterIndex);

            tmpModArg.hostInfo   = &host;
            tmpModArg.html       = html;
            tmpModArg.htmlLength = htmlLength;
            tmpModArg.text       = pureText;
            tmpModArg.textLength = textLength;
            tmpModArg.mysqlDB1   = &gMysqlDB1;
            tmpModArg.mysqlDB2   = &gMysqlDB2;

            /* The handler receives both connections, so both locks are held
               for the duration of the call. */
            thrdBlock(BLOCKDB1);
            thrdBlock(BLOCKINDEX);

            if (modFilter(&tmpModArg) == 0)
            {
                if (host.type == 4)
                {
                    snprintf(dbg,sizeof(dbg),
                             "module skip host=%.100s page=%.255s module=%.255s mime=%.127s bytes=%u",
                             host.Host,
                             host.Page,
                             moduleFilename ? moduleFilename : "(unknown)",
                             host.HttpContentType,
                             htmlLength);
                    DEBUG_LOG(dbg);
                }
                thrdUnBlock(BLOCKDB1);
                thrdUnBlock(BLOCKINDEX);
                FREE(cTmp);
                FREE(pureText);
                FREE(htmlcache);

                return 0;
            }

            thrdUnBlock(BLOCKDB1);
            thrdUnBlock(BLOCKINDEX);

            if (host.type == 4)
            {
                snprintf(dbg,sizeof(dbg),
                         "module handled host=%.100s page=%.255s module=%.255s mime=%.127s bytes=%u text=%u",
                         host.Host,
                         host.Page,
                         moduleFilename ? moduleFilename : "(unknown)",
                         host.HttpContentType,
                         htmlLength,
                         tmpModArg.textLength);
                DEBUG_LOG(dbg);
            }

            /* Trust the length reported by the module only when it is
               plausible; otherwise measure the buffer. */
            if (tmpModArg.textLength > 0 && tmpModArg.textLength < MAXPACKETBUFSIZE)
                textLength = tmpModArg.textLength;
            else
                textLength = strlen(pureText);

            if (textLength >= MAXPACKETBUFSIZE)
                textLength = MAXPACKETBUFSIZE - 1;
            pureText[textLength] = '\0';
        }
    }

    /* here we have text that could be dirty so we must clean it */
    if (host.type == 4)
    {
        RemoveShit(pureText);
        textLength = strlen(pureText);
    }

    sqlQuery = malloc(MAXQUERYSIZE);
    if (sqlQuery == NULL)
        MemoryCorruptedHandler("IndexPage");

    /* Clean the title (or, when the page has none, the description) and
       then escape it for SQL. */
    if (usetitle == 1)
        RemoveShit(title);
    else
        RemoveShit(host.Description);

    const char *rawTitle = (usetitle == 1) ? title : host.Description;
    size_t titleLen = strlen(rawTitle);
    /* Clamp so that the escaped title always fits into sanTitle. */
    if (titleLen >= MAXDESCRIPTIONSIZE)
        titleLen = MAXDESCRIPTIONSIZE - 1;

    /* All escaping below uses the DB2 connection and is done under BLOCKINDEX. */
    thrdBlock(BLOCKINDEX);
    mysql_real_escape_string(&gMysqlDB2, sanTitle, rawTitle, titleLen);
    sanTitle[titleLen * 2] = '\0'; /* mysql_real_escape_string already NUL-terminates; this is only belt and braces */

    /* Escape host and page (into the larger buffers declared above) */
    mysql_real_escape_string(&gMysqlDB2, sanHostname, host.Host, strlen(host.Host));
    mysql_real_escape_string(&gMysqlDB2, sanPage, host.Page, strlen(host.Page));

    /* Escape the text for the `text` column */
    sanText = (char*)malloc(MAXPACKETBUFSIZE * 2 + 1);
    if (sanText == NULL)
        MemoryCorruptedHandler("IndexPage");

    mysql_real_escape_string(&gMysqlDB2, sanText, pureText, textLength);
    thrdUnBlock(BLOCKINDEX);

    /* NOTE(review): the snprintf result is not checked, so a statement
       longer than MAXQUERYSIZE is sent truncated. */
    memset(sqlQuery, 0, MAXQUERYSIZE);
    snprintf(
        sqlQuery,
        MAXQUERYSIZE,
        "INSERT DELAYED INTO %s SET host_id = %d, hostname = '%s', page='%s',"
        "title=TRIM('%s'), date=curdate(), time=curtime(), version=%i, level=%i,"
        "`text`=TRIM('%s');",
        gTable,
        GetHostId(host),
        sanHostname,
        sanPage,
        sanTitle,
        DBVERSION,
        host.level,
        sanText
    );

    my_mysql_ping(&gMysqlDB2, BLOCKINDEX);
    if (QueryDb2WithReconnectRetry(sqlQuery))
    {
        const char *dbErr = mysql_error(&gMysqlDB2);
        int retriedWithFallback = 0;

        /* The server rejected some bytes of the text: rebuild title and text
           with every invalid UTF-8 byte blanked out and try once more. */
        if (dbErr && strstr(dbErr, "Incorrect string value"))
        {
            char fallbackTitle[MAXDESCRIPTIONSIZE];
            char *fallbackText = (char*)malloc(MAXPACKETBUFSIZE);

            if (fallbackText)
            {
                BuildUtf8SqlFallback(rawTitle, fallbackTitle, sizeof(fallbackTitle));
                BuildUtf8SqlFallback(pureText, fallbackText, MAXPACKETBUFSIZE);

                thrdBlock(BLOCKINDEX);
                mysql_real_escape_string(&gMysqlDB2, sanTitle, fallbackTitle, strlen(fallbackTitle));
                mysql_real_escape_string(&gMysqlDB2, sanText, fallbackText, strlen(fallbackText));
                thrdUnBlock(BLOCKINDEX);

                memset(sqlQuery, 0, MAXQUERYSIZE);
                snprintf(
                    sqlQuery,
                    MAXQUERYSIZE,
                    "INSERT DELAYED INTO %s SET host_id = %d, hostname = '%s', page='%s',"
                    "title=TRIM('%s'), date=curdate(), time=curtime(), version=%i, level=%i,"
                    "`text`=TRIM('%s');",
                    gTable,
                    GetHostId(host),
                    sanHostname,
                    sanPage,
                    sanTitle,
                    DBVERSION,
                    host.level,
                    sanText
                );

                my_mysql_ping(&gMysqlDB2, BLOCKINDEX);
                if (!QueryDb2WithReconnectRetry(sqlQuery))
                {
                    retriedWithFallback = 1;
                    fprintf(stderr,
                            "\n[WARN] IndexPage retry fallback: page salvata con testo ripulito (%s%s)\n\n",
                            host.Host, host.Page);
                }
                FREE(fallbackText);
            }
        }

        if (retriedWithFallback == 0)
        {
            /* Connection still down after the retry: skip this page quietly. */
            if (IsDb2TransientDisconnect(&gMysqlDB2))
            {
                fprintf(stderr,
                        "\n[WARN] DB2 ancora non disponibile dopo retry: salto pagina e continuo (%s%s)\n",
                        host.Host, host.Page);

                FREE(cTmp);
                FREE(pureText);
                if (htmlcache)
                    FREE(htmlcache);
                FREE(sqlQuery);
                FREE(sanText);

                return 0;   /* page skipped, but no fatal state is raised */
            }

            /* Any other failure is logged in full. */
            ERROR_LOG(mysql_error(&gMysqlDB2))
            ERROR_LOG(sqlQuery)
            LogDb2QueryFailure("insert", sqlQuery);

            FREE(cTmp);
            FREE(pureText);
            if (htmlcache)
                FREE(htmlcache);
            FREE(sqlQuery);
            FREE(sanText);

            return 0;   /* page not indexed: the scan continues anyway */
        }
    }

    /* this page is indexed correctly */

    FREE(cTmp);
    FREE(pureText);
    FREE(sanText);

    if (htmlcache && xCacheHtml == 1)  /* saves html cache */
    {
        size_t cacheSqlLen;

        /* Estimated statement length; 96 bytes cover the fixed SQL text. */
        cacheSqlLen = strlen(gTable) + strlen(htmlcache) + strlen(sanHostname) + strlen(sanPage) + 96;

        /* htmlcache is only allocated for host.type == 1 above, so this first
           branch is purely defensive. */
        if (host.type != 1)
        {
            fprintf(stderr,
                    "\n[WARN] Cache skip: non-HTML asset non salvato in cache (%s%s)\n",
                    host.Host,
                    host.Page);
        }
        else if (cacheSqlLen >= MAXQUERYSIZE)
        {
            fprintf(stderr,
                    "\n[WARN] Cache skip: pagina troppo grande per query cache (%s%s, escaped=%lu)\n",
                    host.Host,
                    host.Page,
                    (unsigned long)strlen(htmlcache));
        }
        else
        {
            if (xCacheHtmlCompressed == 1)
                snprintf(sqlQuery, MAXQUERYSIZE,
                         "UPDATE %s SET `cache`=COMPRESS('%s') WHERE hostname='%s' and page='%s';",
                         gTable, htmlcache, sanHostname, sanPage);
            else
                snprintf(sqlQuery, MAXQUERYSIZE,
                         "UPDATE %s SET `cache`='%s' WHERE hostname='%s' and page='%s';",
                         gTable, htmlcache, sanHostname, sanPage);

            /* A failed cache update is logged but does not undo the insert:
               the function still returns 1. */
            if (QueryDb2WithReconnectRetry(sqlQuery))
            {
                ERROR_LOG(mysql_error(&gMysqlDB2))
                LogDb2QueryFailure("cache update", sqlQuery);
            }
        }

        FREE(htmlcache);
    }

    FREE(sqlQuery);

    return 1;
}


/* IsPageIndexed
*  Looks host->Page up in `pagelist` for the host id of *host.
*
*  Returns 0 when the lookup reports zero rows (page not indexed yet),
*  1 otherwise. Note that the row id itself is NOT returned, and that any
*  row count other than zero - including the error value reported by
*  mysql_affected_rows() - yields 1, so a failed lookup makes the caller
*  treat the page as already indexed.
*
*  The page name comes from crawled URLs, so it is escaped with
*  mysql_real_escape_string() before being embedded in the statement, and
*  the statement is built with a bounded snprintf().
*/
int IsPageIndexed(struct sHost* host)
{
	char* sqlQuery;
	char* sanPage;
	size_t pageLen;
	MYSQL_RES gRes;
	MYSQL_RES** tmpRes=NULL;
	int ret=1;
	
	pageLen=strlen(host->Page);
	
	tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES));
	sqlQuery = malloc(MAXQUERYSIZE);
	/* worst case every byte is escaped, plus the terminating NUL */
	sanPage = malloc(pageLen*2+1);
	
	if(tmpRes==NULL || sqlQuery==NULL || sanPage==NULL)
		MemoryCorruptedHandler("IsPageIndexed");
	
	/* make the check below well defined even if no result gets stored */
	*tmpRes=NULL;
	
	/* escaping uses the DB2 connection, so it is done under BLOCKINDEX */
	thrdBlock(BLOCKINDEX);
	mysql_real_escape_string(&gMysqlDB2, sanPage, host->Page, pageLen);
	thrdUnBlock(BLOCKINDEX);
	
	snprintf(sqlQuery,MAXQUERYSIZE,"SELECT id FROM pagelist WHERE host_id = %d AND page='%s' LIMIT 1",GetHostId(*host), sanPage);
	
	FREE(sanPage);
	
	my_mysql_query_and_store_results(&gMysqlDB2, sqlQuery,tmpRes,&gRes,BLOCKINDEX);
	
	if(mysql_affected_rows(&gMysqlDB2)==0)	/* Page is not indexed -> return 0 */
		ret = 0;
	
	if(*tmpRes)
	{
		mysql_free_result(*tmpRes);
	}
	
	FREE(tmpRes);
	FREE(sqlQuery);
	
	return ret;
}

/* Appends pre+mid+post at out[*y], advancing *y by the number of bytes
*  written. Returns 1 on success, 0 when the three pieces plus a terminating
*  NUL do not fit into a buffer of maxout bytes (nothing is written then).
*/
static int sqlTextAppend3(char* out, int* y, int maxout, const char* pre, const char* mid, const char* post)
{
	size_t preLen=strlen(pre);
	size_t midLen=strlen(mid);
	size_t postLen=strlen(post);
	size_t total=preLen+midLen+postLen;
	
	if((size_t)*y+total >= (size_t)maxout)
		return 0;
	
	memcpy(out+*y, pre, preLen);
	memcpy(out+*y+preLen, mid, midLen);
	memcpy(out+*y+preLen+midLen, post, postLen);
	*y+=(int)total;
	
	return 1;
}

/* sqlTextToUTF8
*  Copies `text` into `out`, replacing HTML entities with SQL fragments:
*
*   - numeric entities (&#NNN; and &#xHHH;) become
*       ', CONVERT(CONVERT(0x<hex> using UCS2) using utf8mb4),'
*   - named entities found in ahList become either their plain replacement
*     (type 1) or   ', CONVERT(<rep> using utf8mb4),'
*   - anything else, including unknown or malformed entities, is copied
*     verbatim.
*
*  The fragments close and re-open a single-quoted SQL string, so the result
*  is meant to be placed between quotes inside a list of string expressions.
*
*  `out` must have room for `maxout` bytes; it is zero-filled first and is
*  always NUL-terminated on return. At most maxout-1 bytes are produced and
*  plain text beyond that is silently dropped.
*
*  Returns 1 on success, 0 when an entity replacement does not fit into
*  `out` or when the arguments are unusable (NULL pointers, maxout <= 0).
*/
int sqlTextToUTF8(char* text, char* out, int maxout)
{
	/* The fragment texts live in one place so that the space check and the
	   write position are always derived from their real lengths. */
	static const char ucs2Open[]  = "', CONVERT(CONVERT(0x";
	static const char ucs2Close[] = " using UCS2) using utf8mb4),'";
	static const char convOpen[]  = "', CONVERT(";
	static const char convClose[] = " using utf8mb4),'";
	int x,y;
	int textLen;
	unsigned char curC;
	char* aass;
	char ssaa[10];
	size_t entLen;
	int cont;
	int bAscFound;
	
	if(text==NULL || out==NULL || maxout<=0)
		return 0;
	
	textLen=(int)strlen(text);
	
	memset(out,0,maxout);
	
	y=0;
	
	/* stop one byte early so that the terminating NUL is never overwritten */
	for(x=0; x<textLen && y<maxout-1 ;x++)
	{
		curC=(unsigned char)text[x];
		if(curC!='&')
		{
			out[y++]=(char)curC;
			continue;
		}
		
		bAscFound=0;
		
		/* an entity is at most 8 characters between '&' and ';' */
		aass=strchr(text+x,';');
		if(aass && aass-(text+x) < 10)
		{
			entLen=(size_t)(aass-(text+x))-1;
			memset(ssaa,0,sizeof(ssaa));
			memcpy(ssaa,text+x+1,entLen);
			
			if(ssaa[0]=='#')
			{
				char val[10];
				size_t valLen;
				
				if(ssaa[1]=='x')
					snprintf(val,sizeof(val),"%s",ssaa+2);
				else
					snprintf(val,sizeof(val),"%X",(unsigned int)atoi(ssaa+1));
				
				valLen=strlen(val);
				
				/* Only genuine hex digits may be spliced into the SQL
				   literal; anything else is kept as ordinary text. */
				if(valLen>0 && strspn(val,"0123456789abcdefABCDEF")==valLen)
				{
					if(!sqlTextAppend3(out,&y,maxout,ucs2Open,val,ucs2Close))
						return 0;
					
					/* skip the entity body; the loop increment skips ';' */
					x+=(int)entLen+1;
					continue;
				}
				
				out[y++]='&';
				continue;
			}
			
			for(cont=0; ahList[cont].htmlChar && bAscFound==0; cont++)
			{
				if( strcmp( ahList[cont].htmlChar, ssaa ) != 0 )
					continue;
				
				if(ahList[cont].type==1)	/*ascii*/
				{
					if(!sqlTextAppend3(out,&y,maxout,"",ahList[cont].rep,""))
						return 0;
				}
				else						/*UTF8*/
				{
					if(!sqlTextAppend3(out,&y,maxout,convOpen,ahList[cont].rep,convClose))
						return 0;
				}
				
				bAscFound=1;
				x+=(int)entLen+1;
			}
			
		}	/*if(aass && aass-(text+x) < 10)*/
		
		/* not a recognised entity: keep the ampersand as it is */
		if(bAscFound==0)
			out[y++]='&';
	}
	
	return 1;
}

/*****************************************************************************************/
/* OOI: ows own index */

/* BuildOwsOwnIndex
*  (Re)builds the OpenWebSpider own inverted index from the text stored in
*  `pagelist`.
*
*  flag = 0 = index only host.Page
*  flag = 1 = index all pages from the domain pointed by host.Host
*  flag = 2 = index all un-indexed pages (host is not used)
*
*  For flag 0 and 1 the existing `ii` rows of the affected pages are deleted
*  first. Pages are then tokenized one by one; every OWSINDEXMAXSWAPDELAY
*  pages the in-memory index is written to the DB and started afresh.
*
*  Returns 1 when the run completes, 0 when the arguments are unusable
*  (unknown flag, or NULL host for flag 0/1).
*/
int BuildOwsOwnIndex(struct sHost* host, unsigned int flag)
{
	MYSQL_RES gRes;
	MYSQL_RES** tmpRes=NULL;
	MYSQL_ROW row;
	char* sqlQuery;
	OOI_NODE* lexicon;
	unsigned int res_elements, counter = 0;
	
	/* reject what the switch below cannot handle before allocating anything */
	if(flag > 2 || (flag < 2 && host == NULL))
		return 0;
	
	tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES));
	sqlQuery = malloc(MAXQUERYSIZE);

	if(tmpRes==NULL || sqlQuery==NULL)
		MemoryCorruptedHandler("BuildOwsOwnIndex");

	/* make the check after the fetch loop well defined even if no result gets stored */
	*tmpRes=NULL;

	lexicon_number_of_elements = 0;
	lexicon_actual_size = LEXICONWORDSIZE;
	lexicon = InitLexicon();

	/* The DELETE statements are built with the same escaping helper as the
	   SELECTs, so that host and page names are never embedded raw.
	   NOTE(review): relies on snprintf_mysql_escaped_sql_statement() escaping
	   its %s arguments, as its use for the SELECTs suggests - confirm. */
	switch(flag)
	{
		case 0:
			snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"DELETE ii FROM pagelist, ii WHERE pagelist.hostname =\'%s\' AND pagelist.page =\'%s\' AND ii.pageid = pagelist.id ",host->Host, host->Page);
			my_mysql_query(&gMysqlDB2, sqlQuery,NO_BLOCK);

			snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT id, text FROM pagelist WHERE hostname = \'%s\' and page = \'%s\' ",host->Host, host->Page);
		break;
		case 1:
			snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"DELETE ii FROM pagelist, ii WHERE pagelist.hostname =\'%s\' AND ii.pageid = pagelist.id ",host->Host);
			my_mysql_query(&gMysqlDB2, sqlQuery,NO_BLOCK);

			snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT id, text FROM pagelist WHERE hostname =\'%s\' ",host->Host);
		break;
		case 2:
			snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT pagelist.id, pagelist.text FROM pagelist LEFT OUTER JOIN view_ii on pagelist.id = view_ii.pageid where view_ii.pageid is NULL ");
		break;
		default:
			/* unreachable: flag was validated above */
		break;
	}
		
	my_mysql_query_and_store_results(&gMysqlDB2,sqlQuery,tmpRes,&gRes,NO_BLOCK);
	
	FREE(sqlQuery);

	res_elements = (unsigned int)mysql_affected_rows(&gMysqlDB2);
	
	if(flag > 0)
		printf("Building OpenWebSpider Own Index (0 docs of %u)...          ", res_elements);

	fflush(stdout);
	
	
	while( (row = mysql_fetch_row(&gRes)) )
	{
		/* a NULL column (e.g. a page stored without text) is counted but not tokenized */
		if(row[0] && row[1])
			IndexPage2((char*)row[1], atoi(row[0]),&lexicon);
		counter++;
		if(counter % 10 == 0 || counter == res_elements)
			printf("\rBuilding OpenWebSpider Own Index (%u docs of %u)...          ", counter, res_elements);

		/* every OWSINDEXMAXSWAPDELAY pages swap the index to the DB and reinit the structures */
		if(counter % OWSINDEXMAXSWAPDELAY == 0)
		{
			StoreOwsIndex(lexicon);

			FreeOwsIndex(lexicon);

			/* reinit all */
			lexicon_number_of_elements = 0;
			lexicon_actual_size = LEXICONWORDSIZE;
			lexicon = InitLexicon();

			printf("\rBuilding OpenWebSpider Own Index (%u docs of %u)...          ", counter, res_elements);
		}
	}
	
	if(*tmpRes)
	{
		mysql_free_result(*tmpRes);
	}
	
	FREE(tmpRes);

	printf("\r\n");

	/* flush whatever was collected since the last swap */
	StoreOwsIndex(lexicon);

	FreeOwsIndex(lexicon);

	printf("\r\n");

	return 1;
}


/* IndexPage2
*  Tokenizes `text` (split on INDEXERTOKENS) and records every word whose
*  length lies strictly between OWSINDEXMINWORDSIZE and OWSINDEXMAXWORDSIZE
*  in the in-memory index: unknown words are first added to *lexicon, then a
*  posting (page_id, position) is appended for the word. `position` counts
*  accepted words only, starting from 0.
*
*  `text` is modified in place: strtok() writes NULs into it and accepted
*  words are upper-cased. strtok() keeps static state, so this function
*  must not run concurrently with other strtok() users.
*
*  Returns 1 when at least one token was found, 0 when there was none or
*  when text or lexicon is NULL.
*/
int IndexPage2(char* text, unsigned int page_id, OOI_NODE** lexicon)
{
	char* pCh = NULL;
	unsigned int wordLen;
	unsigned int position = 0;
	
	/* strtok(NULL, ...) would silently resume a previous, unrelated scan */
	if(text==NULL || lexicon==NULL)
		return 0;
	
	/* step 1: we split all tokens */
	pCh = strtok (text,INDEXERTOKENS);
	
	if(pCh==NULL || pCh[0]==0)
		return 0;
	
	while(pCh != NULL)
	{
		wordLen = strlen(pCh);
		if(wordLen>OWSINDEXMINWORDSIZE && wordLen<OWSINDEXMAXWORDSIZE)
		{
			/* _strupr() upper-cases the token in place, so the calls below see the upper-cased word too */
			if(ndzLookForWord(*lexicon,_strupr(pCh))==-1)	/* Add unique word */
				lstAddWord(lexicon,pCh);
			
			UpdateInvertedIndex(*lexicon, pCh,page_id, position);
			
			position ++ ;
		}

		pCh = strtok (NULL, INDEXERTOKENS);
	}
	
	return 1;
}

/* UpdateInvertedIndex
*  Appends one posting (doc_id, position) to the posting list of `word`.
*
*  The list hangs off lexicon[pos].ii, whose `last` member points at the
*  current tail, so the append does not walk the list. Nothing happens when
*  the word is not in the lexicon or when its list head is missing.
*/
void UpdateInvertedIndex(OOI_NODE* lexicon, char* word, unsigned int doc_id, unsigned int position)
{
	int pos;
	INVERTED_INDEX* head;
	INVERTED_INDEX* node;
	
	pos = ndzLookForWord(lexicon, word);
	
	if(pos==-1)
		return;
	
	head = lexicon[pos].ii;
	
	if(head==NULL || head->last==NULL)
		return;
	
	node = malloc(sizeof(INVERTED_INDEX));
	if(node==NULL)
	{
		MemoryCorruptedHandler("UpdateInvertedIndex");
		return;
	}
	
	node->doc_id = doc_id;
	node->position = position;
	node->next = NULL;
	node->last = NULL;	/* only the head's tail pointer is read and updated here */
	
	head->last->next = node;
	head->last = node;
}

/* GetWordId
*  Looks `word` up in the wordlist table of DB2.
*  Returns the value of the id column of the first row (parsed with atoi),
*  or 0 when no row could be fetched.
*/
int GetWordId(char* word)
{
	MYSQL_RES resultBuf;
	MYSQL_RES** resultHandle=NULL;
	MYSQL_ROW firstRow;
	char* stmt;
	unsigned int wordId = 0;
	
	resultHandle=(MYSQL_RES**)malloc(sizeof(MYSQL_RES));
	stmt = malloc(MAXQUERYSIZE);
	
	if(stmt==NULL || resultHandle==NULL)
		MemoryCorruptedHandler("GetWordId");
	
	/* the statement is built by the escaping helper; the buffer is released as soon as the query has run */
	snprintf_mysql_escaped_sql_statement(&gMysqlDB2,stmt,MAXQUERYSIZE-1,"SELECT id FROM %s.wordlist WHERE word='%s' LIMIT 1", DB2, word);
	my_mysql_query_and_store_results(&gMysqlDB2, stmt,resultHandle,&resultBuf,BLOCKINDEX);
	FREE(stmt);
	
	firstRow = mysql_fetch_row(&resultBuf);
	if(firstRow)
		wordId = atoi(firstRow[0]);
	
	if(*resultHandle)
	{
		mysql_free_result(*resultHandle);
	}
	FREE(resultHandle);
	
	return wordId;
}

/* StoreOwsIndex
*  Writes the in-memory index to DB2: for each of the first
*  lexicon_number_of_elements lexicon entries the word is inserted into
*  <DB2>.wordlist, its id is read back, and all postings of the word are
*  stored in <DB2>.ii with a single multi-row INSERT.
*
*  The word INSERT is built with the same escaping helper GetWordId() uses
*  for the lookup, so both statements refer to the same text. Its result is
*  deliberately ignored: the table has a unique index on `word`, so the
*  statement fails for words that are already present.
*
*  Words without postings (list head only) produce no `ii` statement.
*  A failed `ii` INSERT is logged and the loop continues.
*/
void StoreOwsIndex(OOI_NODE* lexicon)
{
	MYCSTR AsqlQuery;
	unsigned int i;
	INVERTED_INDEX* ii;
	char* sqlQuery;
	unsigned int word_id;
	char strTmp[50];
	unsigned int isFirst;

	if(lexicon==NULL)
		return;

	/*init*/
	AsqlQuery.myString=NULL;

	sqlQuery = malloc(MAXQUERYSIZE);
	if(sqlQuery==NULL)
	{
		MemoryCorruptedHandler("StoreOwsIndex");
		return;
	}

	my_mysql_ping(&gMysqlDB2,BLOCKINDEX);
	
	for(i=0;i<lexicon_number_of_elements;i++)
	{
		if( (i+1) % 500 == 0 || i == lexicon_number_of_elements-1)
			printf("\rStoring OpenWebSpider Index to the DB(%i words of %i)...          ",i+1 , lexicon_number_of_elements);
		
		/* Add word (the table has an unique index on the field word)
		   NOTE(review): relies on snprintf_mysql_escaped_sql_statement() escaping
		   its %s arguments, as its use in GetWordId() suggests - confirm. */
		snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"INSERT INTO %s.wordlist (word) VALUES('%s')", DB2, lexicon[i].field);
		
		my_mysql_query(&gMysqlDB2, sqlQuery, NO_BLOCK);
		
		word_id = GetWordId(lexicon[i].field);
		
		/* is the word in the DB? */
		if(word_id == 0)
			continue;
		
		/* last == head means that nothing was ever appended for this word */
		ii = lexicon[i].ii;
		if(ii == NULL || ii->last == NULL || ii->last == ii)
			continue;
		
		myCStrCpy(&AsqlQuery, "INSERT INTO ");
		myCStrCat(&AsqlQuery, DB2);
		myCStrCat(&AsqlQuery, ".ii (wordid, pageid, position) VALUES");
		isFirst = 1;
		
		for(; ii != NULL; ii = ii->next)
		{
			/* entries with doc_id 0 (such as the list head) are skipped */
			if(ii->doc_id == 0)
				continue;
			
			/* ids are unsigned: %u keeps large values from turning negative */
			if(isFirst)
			{
				snprintf(strTmp,sizeof(strTmp),"(%u,%u,%u)",word_id, (unsigned int)ii->doc_id, (unsigned int)ii->position );
				isFirst=0;
			}
			else
				snprintf(strTmp,sizeof(strTmp),",(%u,%u,%u)",word_id, (unsigned int)ii->doc_id, (unsigned int)ii->position );
			
			myCStrCat(&AsqlQuery, strTmp);
		}
		
		if(my_mysql_query(&gMysqlDB2, AsqlQuery.myString, NO_BLOCK))
		{
			ERROR_LOG(mysql_error(&gMysqlDB2))
			ERROR_LOG(AsqlQuery.myString)
			printf("\r\nQuery Error in function StoreOwsIndex(): %s\r\n",mysql_error(&gMysqlDB2));
		}
	}
	
	FREE(sqlQuery);
	FREE(AsqlQuery.myString);
}

#endif

/*EOF*/
