
/* OpenWebSpider
*
*  Authors:     Stefano Alimonti AND Stefano Fantin
*  Version:     0.8
*  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it
*
*
* This file is part of OpenWebSpider
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
*/

#ifndef __URLFUNCT
#define __URLFUNCT

extern int gDb2FatalError;

/* ReturnFirstUrl
* Host <-
* Marks the host currently stored in *Host as indexed (writing the crawl
* counters to hostlist) and then loads the next pending host into *Host.
*
* If the global nextHost is set, that host is inserted into hostlist and
* selected explicitly; otherwise the pending host with the highest priority
* is taken.
*
* Returns  1 when *Host was replaced by the next host, or when starthostonly
*            is set (in which case iQuit is raised and *Host is untouched);
*         -1 when Host is NULL, the query produced no usable row, or the
*            page was rejected.
*/
int ReturnFirstUrl(struct sHost* Host)
{
    MYSQL_ROW    row;
    char         sqlQuery[MAXQUERYSIZE];
    char         bytesText[32];
    MYSQL_RES    gRes;
    MYSQL_RES   *res = NULL;
    struct sHost tmpHost;
    int          result = -1;

    if (Host == NULL)
    {
        printf("Critical error\r\n\r\n");
        return -1;
    }

    my_mysql_ping(&gMysqlDB1, BLOCKDB1);

    /* mark the current host as indexed */
    if (Host->Host[0] != 0)
    {
        /* The byte counter is pre-formatted and passed through %s so that the
           escaping query builder only has to deal with %d and %s, the two
           conversions it is already used with elsewhere in this file. */
        snprintf(bytesText, sizeof(bytesText), "%llu", (unsigned long long)bytesDownloaded);

        /* Hostnames come from crawled pages: build the statement with the
           escaping, length-bounded helper instead of a raw sprintf. */
        snprintf_mysql_escaped_sql_statement(&gMysqlDB1, sqlQuery, MAXQUERYSIZE - 1,
                "UPDATE hostlist SET status = 1,indexed_pages=%d,time_sec=%d,"
                "bytes_downloaded=%s, error_pages=%d WHERE hostname = '%s' limit 1",
                nPagesViewed,
                (int)((GetTickCount() - startTimeMS) / 1000),
                bytesText,
                nErrorPages,
                Host->Host);

        my_mysql_query(&gMysqlDB1, sqlQuery, BLOCKDB1);
    }

    printStats(Host, (iDoNextHost == 1) ? 2 : 0);

    if (starthostonly == 1)
    {
        iQuit = 1;
        return 1;
    }

    if (nextHost)
    {
        AddExternalHost(*nextHost, NULL);

        snprintf_mysql_escaped_sql_statement(&gMysqlDB1, sqlQuery, MAXQUERYSIZE - 1,
                "SELECT hostname, page, port "
                "FROM hostlist "
                "WHERE hostname='%s' AND port=%d AND status=0 "
                "ORDER BY priority DESC LIMIT 1",
                nextHost->Host, nextHost->port);

        FREE(nextHost);
        nextHost = NULL;
    }
    else
    {
        snprintf(sqlQuery, sizeof(sqlQuery), "%s",
                "SELECT hostname, page, port "
                "FROM hostlist "
                "WHERE status=0 "
                "ORDER BY priority DESC, id LIMIT 1");
    }

    /* The result pointer lives on the stack: there is no allocation that
       could fail before the query runs. */
    my_mysql_query_and_store_results(&gMysqlDB1, sqlQuery, &res, &gRes, BLOCKDB1);

    if (res == NULL)
        return -1;

    if (mysql_num_rows(res) != 0)
    {
        row = mysql_fetch_row(res);

        /* row[0] = hostname, row[1] = page, row[2] = port */
        if (row != NULL && row[0] != NULL && row[1] != NULL)
        {
            memset(&tmpHost, 0, sizeof(tmpHost));

            strncpy(tmpHost.Host, row[0], MAXHOSTSIZE - 1);
            tmpHost.Host[MAXHOSTSIZE - 1] = '\0';

            strncpy(tmpHost.Page, row[1], MAXPAGESIZE - 1);
            tmpHost.Page[MAXPAGESIZE - 1] = '\0';

            tmpHost.port  = (row[2] ? atoi(row[2]) : PORT);
            tmpHost.isSSL = (tmpHost.port == 443) ? 1 : 0;

            tmpHost.host_id = GetHostId(tmpHost);

            /* NOTE(review): CheckPage as defined in this file returns 0 or 1,
               never -1, so this rejection is currently never taken and an
               empty page falls through to PageType. The test is left as it
               was: rejecting empty pages here would change which hosts can
               be picked up - confirm the intended contract first. */
            if (CheckPage(&tmpHost) != -1)
            {
                PageType(&tmpHost);
                *Host = tmpHost;
                result = 1;
            }
        }
    }

    mysql_free_result(res);

    return result;
}




/* OwsPageEndsWithAnyAssetSuffix
* Returns 1 when the path part of `page` (everything before the first '?'
* or '#') ends, case-insensitively, with one of the known document, image,
* video, audio or subtitle extensions; returns 0 otherwise, including for a
* NULL or empty page.
*
* The path is examined in place, so paths of any length are handled and the
* suffix is always taken from the real end of the path.
*/
static int OwsPageEndsWithAnyAssetSuffix(const char *page)
{
    static const char *const exts[] = {
        ".pdf",
        ".jpg",".jpeg",".jpe",".jfif",".pjpeg",".png",".apng",".gif",".webp",".svg",
        ".bmp",".ico",".cur",".tif",".tiff",".avif",".heic",".heif",".jp2",".j2k",".jpf",".jpx",".jxl",
        ".mp4",".m4v",".webm",".ogv",".mov",".avi",".mkv",".flv",".wmv",".asf",
        ".ts",".m2ts",".mpeg",".mpg",".mpe",".vob",".3gp",".3g2",".m3u8",".mpd",
        ".mp3",".m4a",".m4b",".aac",".adts",".oga",".ogg",".opus",".wav",".flac",".wma",".weba",
        ".aif",".aiff",".mid",".midi",".vtt",".srt"
    };
    size_t pathLen;
    size_t extLen;
    size_t i;
    size_t k;

    if (!page || !page[0])
        return 0;

    /* length of the path without query string and fragment */
    pathLen = strcspn(page, "?#");

    for (i = 0; i < sizeof(exts) / sizeof(exts[0]); i++)
    {
        const char *tail;

        extLen = strlen(exts[i]);
        if (extLen > pathLen)
            continue;

        tail = page + (pathLen - extLen);
        for (k = 0; k < extLen; k++)
        {
            if (tolower((unsigned char)tail[k]) != tolower((unsigned char)exts[i][k]))
                break;
        }
        if (k == extLen)
            return 1;
    }
    return 0;
}

/* OwsContainsCiUrl
* Case-insensitive substring search: returns 1 when `needle` occurs anywhere
* in `hay`, 0 otherwise. A NULL argument or an empty needle yields 0.
*/
static int OwsContainsCiUrl(const char *hay, const char *needle)
{
    size_t hayLen;
    size_t needleLen;
    size_t lastStart;
    size_t start;
    size_t matched;

    if (hay == NULL || needle == NULL)
        return 0;

    hayLen = strlen(hay);
    needleLen = strlen(needle);
    if (needleLen == 0 || needleLen > hayLen)
        return 0;

    /* last offset at which the needle still fits into the haystack */
    lastStart = hayLen - needleLen;

    for (start = 0; start <= lastStart; start++)
    {
        matched = 0;
        while (matched < needleLen &&
               tolower((unsigned char)hay[start + matched]) ==
               tolower((unsigned char)needle[matched]))
        {
            matched++;
        }
        if (matched == needleLen)
            return 1;
    }
    return 0;
}

/* OwsPageLooksLikeDynamicAsset
* Heuristic: returns 1 when `page` contains (case-insensitively) any of the
* keywords below, 0 otherwise or when `page` is NULL/empty.
* The match is a plain substring test via OwsContainsCiUrl, so a keyword
* also matches inside longer words.
*/
static int OwsPageLooksLikeDynamicAsset(const char *page)
{
    static const char *const hints[] = {
        "image", "img", "photo", "picture", "thumb", "thumbnail",
        "gallery", "video", "audio", "media", "stream", "playlist",
        "player", "embed", "podcast", "download", "attachment", "asset",
        "assets", "static", "cdn", "blob", "resource", "resources",
        "upload", "uploads", "file", "files", "export", "fetch",
        "render", "proxy", "api", "binary", "raw", "original",
        "full", "large", "size=", "format=", "mime", "content",
        "pdf", "document", "doc", "archive"
    };
    size_t idx;

    if (!page || !page[0])
        return 0;

    for (idx = 0; idx < sizeof(hints) / sizeof(hints[0]); idx++)
    {
        if (OwsContainsCiUrl(page, hints[idx]))
            return 1;
    }
    return 0;
}

/* OwsIsAssetCandidateHost
* Returns 1 when the entry is already typed 4, or when its page either ends
* with a known asset extension or matches the dynamic-asset keyword
* heuristic; returns 0 otherwise or when hst is NULL.
*/
static int OwsIsAssetCandidateHost(const struct sHost *hst)
{
    if (hst == NULL)
        return 0;

    /* the checks run cheapest first; || stops at the first hit */
    return (hst->type == 4 ||
            OwsPageEndsWithAnyAssetSuffix(hst->Page) ||
            OwsPageLooksLikeDynamicAsset(hst->Page)) ? 1 : 0;
}

/* OwsQueueInMemoryIfNew
* Appends hst to the in-memory list (lstFirst) with level + 1, unless
* lstGetNodeByHost already finds an entry for it.
* Returns 1 when the entry was added, 0 when it was already present.
*/
static int OwsQueueInMemoryIfNew(struct sHost hst, unsigned int level)
{
    if (lstGetNodeByHost(lstFirst,hst) != NULL)
        return 0;               /* already queued */

    hst.level = level + 1;
    lstAddHost(&lstFirst,hst);
    return 1;
}

/* OwsProfileRecordQueuedType
* Increments the profiling counter that matches hst->type:
* 1 -> html, 2 -> plain, 4 -> asset, anything else -> other.
* Does nothing when hst is NULL.
*/
static void OwsProfileRecordQueuedType(const struct sHost *hst)
{
    if (hst == NULL)
        return;

    switch (hst->type)
    {
        case 1:
            gProfileQueuedTypeHtml++;
            break;
        case 2:
            gProfileQueuedTypePlain++;
            break;
        case 4:
            gProfileQueuedTypeAsset++;
            break;
        default:
            gProfileQueuedTypeOther++;
            break;
    }
}

/* OwsIsCrawlerNoiseUrl
* Returns 1 when the page starts (case-insensitively) with one of the
* "/cdn-cgi/" challenge path prefixes listed below, 0 otherwise or when
* hst is NULL or its page is empty.
*/
static int OwsIsCrawlerNoiseUrl(const struct sHost *hst)
{
    static const char challengePlatform[] = "/cdn-cgi/challenge-platform/";
    static const char jsChallenge[]       = "/cdn-cgi/l/chk_jschl";

    if (hst == NULL || hst->Page[0] == 0)
        return 0;

    /* sizeof - 1 is the prefix length without the terminating NUL */
    if (strnicmp(hst->Page, challengePlatform, sizeof(challengePlatform) - 1) == 0)
        return 1;

    if (strnicmp(hst->Page, jsChallenge, sizeof(jsChallenge) - 1) == 0)
        return 1;

    return 0;
}

/* AddUrl
*/
int AddUrl(struct sHost hst, unsigned int level,struct sHost* from)
{
	char* sqlQuery; 
    int isAssetCandidate;
    char profileMsg[1024];
    char jsonMsg[1536];
    char jHost[160], jPage[360], jFrom[160];
	
    isAssetCandidate = OwsIsAssetCandidateHost(&hst);
    gProfileDiscoveredTotal++;
    if(isAssetCandidate)
        gProfileAssetCandidates++;
    if(OwsPageLooksLikeDynamicAsset(hst.Page))
        gProfileDynamicCandidates++;
    if(from && stricmp(from->Host,hst.Host)!=0)
        gProfileCrossHostDiscovered++;
    OwsJsonEscapeCopy(hst.Host, jHost, sizeof(jHost));
    OwsJsonEscapeCopy(hst.Page, jPage, sizeof(jPage));
    OwsJsonEscapeCopy(from ? from->Host : "", jFrom, sizeof(jFrom));

    if (OwsIsCrawlerNoiseUrl(&hst))
    {
        gProfileRejectedLimits++;
        snprintf(profileMsg,sizeof(profileMsg),
                 "queue reject crawler_noise host=%.100s page=%.255s level=%u type=%d from=%.100s",
                 hst.Host,hst.Page,level,hst.type,(from ? from->Host : ""));
        CRAWLER_PROFILE_LOG(profileMsg);
        snprintf(jsonMsg,sizeof(jsonMsg),
                 "{\"event\":\"queue\",\"decision\":\"reject\",\"reason\":\"crawler_noise\",\"host\":\"%.100s\",\"page\":\"%.255s\",\"from\":\"%.100s\",\"level\":%u,\"type\":%d}",
                 jHost,jPage,jFrom,level,hst.type);
        CRAWLER_PROFILE_JSONL(jsonMsg);
        return -1;
    }
	
	/* if the host of the current page is the same of that we are indexing and Free Indexing Mode is off*/
	/* bFreeIndexingMode == 1 == Index all pages of the current host and not */
	if(stricmp(IndexingHost.Host,hst.Host)!=0 && bFreeIndexingMode==0)
	{
		AddExternalHost(hst,from);
        if(isAssetCandidate && checkLimits() == 0 && iDoNextHost != 1)
        {
            if(hst.host_id==0)
                hst.host_id = GetHostId(hst);
            if(OwsQueueInMemoryIfNew(hst, level)==1)
            {
                gProfileAcceptedTotal++;
                gProfileExternalAssetsQueued++;
                OwsProfileRecordQueuedType(&hst);
                snprintf(profileMsg,sizeof(profileMsg),
                         "queue accept external_asset host=%.100s page=%.255s level=%u type=%d from=%.100s",
                         hst.Host,hst.Page,level,hst.type,(from ? from->Host : ""));
                CRAWLER_PROFILE_LOG(profileMsg);
                snprintf(jsonMsg,sizeof(jsonMsg),
                         "{\"event\":\"queue\",\"decision\":\"accept\",\"reason\":\"external_asset\",\"host\":\"%.100s\",\"page\":\"%.255s\",\"from\":\"%.100s\",\"level\":%u,\"type\":%d,\"asset\":%d,\"dynamic\":%d}",
                         jHost,jPage,jFrom,level,hst.type,isAssetCandidate,OwsPageLooksLikeDynamicAsset(hst.Page));
                CRAWLER_PROFILE_JSONL(jsonMsg);
            }
            else
            {
                gProfileRejectedDuplicate++;
                snprintf(profileMsg,sizeof(profileMsg),
                         "queue reject duplicate_external_asset host=%.100s page=%.255s level=%u type=%d",
                         hst.Host,hst.Page,level,hst.type);
                CRAWLER_PROFILE_LOG(profileMsg);
                snprintf(jsonMsg,sizeof(jsonMsg),
                         "{\"event\":\"queue\",\"decision\":\"reject\",\"reason\":\"duplicate_external_asset\",\"host\":\"%.100s\",\"page\":\"%.255s\",\"from\":\"%.100s\",\"level\":%u,\"type\":%d,\"asset\":%d}",
                         jHost,jPage,jFrom,level,hst.type,isAssetCandidate);
                CRAWLER_PROFILE_JSONL(jsonMsg);
            }
        }
        else
        {
            snprintf(profileMsg,sizeof(profileMsg),
                     "queue external_db_only host=%.100s page=%.255s level=%u type=%d asset=%d limits=%d switch=%d",
                     hst.Host,hst.Page,level,hst.type,isAssetCandidate,checkLimits(),iDoNextHost);
            CRAWLER_PROFILE_LOG(profileMsg);
            snprintf(jsonMsg,sizeof(jsonMsg),
                     "{\"event\":\"queue\",\"decision\":\"external_db_only\",\"host\":\"%.100s\",\"page\":\"%.255s\",\"from\":\"%.100s\",\"level\":%u,\"type\":%d,\"asset\":%d,\"limits\":%d,\"switch\":%d}",
                     jHost,jPage,jFrom,level,hst.type,isAssetCandidate,checkLimits(),iDoNextHost);
            CRAWLER_PROFILE_JSONL(jsonMsg);
        }
	}
	else
	{
		
		/* if we are in the free indexing mode we will index this page as it was of the current indexing host but we must add this host to the table hostlist */
		/* and we must delete the current page from the Index (pagelist) */
		if(bFreeIndexingMode==1)
		{
			AddExternalHost(hst,from);
			
			sqlQuery = malloc(MAXQUERYSIZE);
			snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"DELETE FROM pagelist WHERE hostname =\'%s\' AND page=\'%s\'",hst.Host, hst.Page);
			my_mysql_query(&gMysqlDB2, sqlQuery, BLOCKINDEX);
			FREE(sqlQuery);
		}
        else
        {
            /* we are in the same host */
				if(from)
	            hst.host_id = from->host_id;
        }

        if(hst.host_id==0)
            hst.host_id = GetHostId(hst);
		
		if(nRelationships==2)
		{
			pRelationships(from,&hst,nRelationships);
		}
		
		/* Check the current page against the robots.txt, 
            the maximum level of depth and the maximum number of pages to be indexed,
            the number of seconds,
            the number of bytes downloaded
            or if we are switching to the next host */
        {
            int blockedRobot = (CheckRobotExclusion(hst.Page)==0);
            int blockedLimits = (checkLimits() == 1);
            int blockedDepth = ((!isAssetCandidate && EXTRA_LIMITS.nMaxDepthLevel == 0 && CRAWLER_LIMITS.nMaxDepthLevel>0 && level >= CRAWLER_LIMITS.nMaxDepthLevel) ||
                                (!isAssetCandidate && EXTRA_LIMITS.nMaxDepthLevel>0 && level >= EXTRA_LIMITS.nMaxDepthLevel));
            int blockedSwitch = (iDoNextHost==1);

            if(blockedRobot || blockedLimits || blockedDepth || blockedSwitch)
            {
                if(blockedRobot) gProfileRejectedRobots++;
                if(blockedLimits) gProfileRejectedLimits++;
                if(blockedDepth) gProfileRejectedDepth++;
                if(blockedSwitch) gProfileRejectedSwitch++;
                snprintf(profileMsg,sizeof(profileMsg),
                         "queue reject host=%.100s page=%.255s level=%u type=%d asset=%d robots=%d limits=%d depth=%d switch=%d",
                         hst.Host,hst.Page,level,hst.type,isAssetCandidate,blockedRobot,blockedLimits,blockedDepth,blockedSwitch);
                CRAWLER_PROFILE_LOG(profileMsg);
                snprintf(jsonMsg,sizeof(jsonMsg),
                         "{\"event\":\"queue\",\"decision\":\"reject\",\"reason\":\"policy\",\"host\":\"%.100s\",\"page\":\"%.255s\",\"from\":\"%.100s\",\"level\":%u,\"type\":%d,\"asset\":%d,\"robots\":%d,\"limits\":%d,\"depth\":%d,\"switch\":%d}",
                         jHost,jPage,jFrom,level,hst.type,isAssetCandidate,blockedRobot,blockedLimits,blockedDepth,blockedSwitch);
                CRAWLER_PROFILE_JSONL(jsonMsg);
                return -1;
            }
            if(isAssetCandidate &&
               ((EXTRA_LIMITS.nMaxDepthLevel == 0 && CRAWLER_LIMITS.nMaxDepthLevel>0 && level >= CRAWLER_LIMITS.nMaxDepthLevel) ||
                (EXTRA_LIMITS.nMaxDepthLevel>0 && level >= EXTRA_LIMITS.nMaxDepthLevel)))
            {
                gProfileAssetDepthBypass++;
            snprintf(profileMsg,sizeof(profileMsg),
                     "queue depth_bypass_asset host=%.100s page=%.255s level=%u type=%d",
                     hst.Host,hst.Page,level,hst.type);
            CRAWLER_PROFILE_LOG(profileMsg);
            snprintf(jsonMsg,sizeof(jsonMsg),
                     "{\"event\":\"queue\",\"decision\":\"depth_bypass\",\"host\":\"%.100s\",\"page\":\"%.255s\",\"from\":\"%.100s\",\"level\":%u,\"type\":%d,\"asset\":%d}",
                     jHost,jPage,jFrom,level,hst.type,isAssetCandidate);
            CRAWLER_PROFILE_JSONL(jsonMsg);
        }
        }
		
		if(OwsQueueInMemoryIfNew(hst, level)!=1)	//Host is not in list
        {
            gProfileRejectedDuplicate++;
            snprintf(profileMsg,sizeof(profileMsg),
                     "queue reject duplicate host=%.100s page=%.255s level=%u type=%d asset=%d",
                     hst.Host,hst.Page,level,hst.type,isAssetCandidate);
            CRAWLER_PROFILE_LOG(profileMsg);
            snprintf(jsonMsg,sizeof(jsonMsg),
                     "{\"event\":\"queue\",\"decision\":\"reject\",\"reason\":\"duplicate\",\"host\":\"%.100s\",\"page\":\"%.255s\",\"from\":\"%.100s\",\"level\":%u,\"type\":%d,\"asset\":%d}",
                     jHost,jPage,jFrom,level,hst.type,isAssetCandidate);
            CRAWLER_PROFILE_JSONL(jsonMsg);
			return -1;
        }
        gProfileAcceptedTotal++;
        OwsProfileRecordQueuedType(&hst);
        snprintf(profileMsg,sizeof(profileMsg),
                 "queue accept host=%.100s page=%.255s level=%u type=%d asset=%d",
                 hst.Host,hst.Page,level,hst.type,isAssetCandidate);
        CRAWLER_PROFILE_LOG(profileMsg);
        snprintf(jsonMsg,sizeof(jsonMsg),
                 "{\"event\":\"queue\",\"decision\":\"accept\",\"reason\":\"normal\",\"host\":\"%.100s\",\"page\":\"%.255s\",\"from\":\"%.100s\",\"level\":%u,\"type\":%d,\"asset\":%d,\"dynamic\":%d}",
                 jHost,jPage,jFrom,level,hst.type,isAssetCandidate,OwsPageLooksLikeDynamicAsset(hst.Page));
        CRAWLER_PROFILE_JSONL(jsonMsg);
	}
	
	return 1;
}

/* AddExternalHost
* Inserts Host (hostname + port, status 0) into hostlist with INSERT IGNORE
* and, when `from` is given, records the link between the two hosts via
* pRelationships.
*
* Nothing is written when bTesting or bAddExternalHost is 1, or when the
* crawler is quitting (iQuit / bKillThread); those cases return 1.
*
* Returns  1 on the normal path (and on the skipped cases above),
*         -1 when the hostname contains a space or the query buffer cannot
*            be allocated.
*/
int AddExternalHost(struct sHost Host,struct sHost* from)
{
    char* sqlQuery;
    char sError[MAXHOSTSIZE+50];

    if(bTesting==1 || bAddExternalHost==1)
        return 1;

    if(iQuit==1 || bKillThread==1)
        return 1;

    /* A hostname must not contain a space anywhere, including position 0. */
    if(strchr(Host.Host ,' ') != NULL)
    {
        snprintf(sError,sizeof(sError),"AddExternalHost(): Found wrong url: %s",Host.Host);
        printf("\r\n %s \r\n",sError);
        ERROR_LOG(sError);

        /* NOTE(review): no matching block on BLOCKEXH is taken in this
           function - confirm that the callers hold it, otherwise this
           release is unbalanced. */
        thrdUnBlock(BLOCKEXH);
        return -1;
    }

    sqlQuery = malloc(MAXQUERYSIZE);
    if(sqlQuery == NULL)
    {
        MemoryCorruptedHandler("AddExternalHost");
        return -1;
    }

    snprintf_mysql_escaped_sql_statement(&gMysqlDB1,sqlQuery,MAXQUERYSIZE-1,"INSERT IGNORE INTO hostlist (hostname,port,status) VALUES('%s',%d, 0);",Host.Host, Host.port);
    my_mysql_query(&gMysqlDB1, sqlQuery,BLOCKDB1);

    FREE(sqlQuery);

    /* "from" could be NULL (ows server switch to an input-defined host) */
    if(from)
        pRelationships(from,&Host,nRelationships);

    return 1;
}


/* GetDir
* Page -> dir <-
* Writes the directory part of Page into dir, always ending with '/'.
*   Page = "/dir1/dir2/page.htm"  => dir = "/dir1/dir2/"
*   Page = "/page.htm"            => dir = "/"
*   Page = "/a/b?x=/y/z"          => dir = "/a/"   (query string is ignored)
*
* dir must have room for strlen(Page) + 2 bytes. A NULL Page is treated as
* an empty page (dir = "/").
*
* Returns 1 on success, -1 when dir is NULL.
*/
int GetDir(char* Page,char* dir)
{
    size_t pathLen;
    size_t last = 0;
    size_t i;

    if (dir == NULL)
        return -1;

    /* only the part in front of the query string is a path */
    pathLen = (Page != NULL) ? strcspn(Page, "?") : 0;

    /* index of the last '/' inside the path (0 when there is none) */
    for (i = 0; i < pathLen; i++)
    {
        if (Page[i] == '/')
            last = i;
    }

    if (last == 0)
    {
        dir[0] = '/';
        dir[1] = '\0';
        return 1;
    }

    /* copy everything in front of the last '/'; memmove keeps this valid
       even if the caller passes overlapping buffers */
    memmove(dir, Page, last);
    dir[last] = '\0';

    if (dir[last - 1] != '/')
    {
        dir[last] = '/';
        dir[last + 1] = '\0';
    }

    return 1;
}


/* CheckPage
* Normalises Host->Page in place, working on a private copy so that no
* string function ever operates on overlapping buffers:
*   1. the fragment ("#...") is removed;
*   2. trailing spaces, tabs, CR and LF are removed;
*   3. a leading '/' is added when missing (the result is cut to
*      MAXPAGESIZE - 1 characters).
*
* Returns 1 when Host->Page was rewritten, 0 when Host is NULL or the page
* is empty after steps 1 and 2 (Host->Page is then left untouched).
*/
int CheckPage(struct sHost *Host)
{
    char   work[MAXPAGESIZE];
    size_t n;

    if (Host == NULL)
        return 0;

    /* bounded private copy of the page */
    strncpy(work, Host->Page, MAXPAGESIZE - 1);
    work[MAXPAGESIZE - 1] = '\0';

    /* 1) cut at the first '#', if there is one */
    work[strcspn(work, "#")] = '\0';

    /* 2) strip trailing whitespace */
    for (n = strlen(work); n > 0; n--)
    {
        char c = work[n - 1];

        if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
            break;
        work[n - 1] = '\0';
    }

    /* an empty page is not valid */
    if (n == 0)
        return 0;

    /* 3) make sure the page starts with '/' */
    if (work[0] != '/')
    {
        /* keep at most MAXPAGESIZE - 2 characters so that '/' + text + NUL
           still fits into the buffer */
        if (n > (size_t)(MAXPAGESIZE - 2))
            n = (size_t)(MAXPAGESIZE - 2);

        memmove(work + 1, work, n);
        work[0] = '/';
        work[n + 1] = '\0';
    }

    strncpy(Host->Page, work, MAXPAGESIZE - 1);
    Host->Page[MAXPAGESIZE - 1] = '\0';

    return 1;
}


/* OwsPageHasSuffixCi
* Returns non-zero when the first pathLen characters of path end,
* case-insensitively, with ext. path must be NUL-terminated at pathLen.
*/
static int OwsPageHasSuffixCi(const char *path, size_t pathLen, const char *ext)
{
    size_t extLen = strlen(ext);

    return pathLen >= extLen &&
           stricmp((char*)path + (pathLen - extLen), (char*)ext) == 0;
}

/* PageType
* Host <-
* Classifies Host->Page and stores the result in Host->type:
*   1 = html      ("/test.htm", anything ending in '/', or a page with a
*                  query string / fragment that matched no extension list)
*   2 = plain text (PlainTextExtension)
*   4 = custom extension (CustomExtensions), or - in aggressive index mode -
*       a dynamic-asset looking page / a page with an unknown extension
*   3 = discard   (unknown extension, no query string, non-aggressive mode)
*
* A page without any '.' and without query string or fragment is treated
* as a directory: unless it looks like a dynamic asset, '/' is appended to
* Host->Page.
*
* Returns 1 on success, -1 when Host is NULL or the page is too long to
* take the appended '/'.
*/
int PageType(struct sHost* Host)
{
    int i;
    char rPage[MAXPAGESIZE];
    char *cut;
    char *qmark;
    int bArgs=0;        /* bArgs=1 == the page contains a '?' or a '#' */
    size_t pageLen;
    size_t rlen;
    size_t baseLen;

    if(Host==NULL)
        return -1;

    /* rPage = page without query string and fragment */
    memset(rPage,0,MAXPAGESIZE);
    strncpy(rPage,Host->Page,MAXPAGESIZE-1);

    /* A '?' or '#' in the very first position is deliberately not cut,
       exactly as before; only a separator after position 0 counts. */
    cut = strchr(rPage,'?');
    if(cut != NULL && cut != rPage)
    {
        *cut = 0;
        bArgs=1;
    }
    cut = strchr(rPage,'#');
    if(cut != NULL && cut != rPage)
    {
        *cut = 0;
        bArgs=1;
    }

    pageLen = strlen(Host->Page);

    /* pageLen > 0 guards the look at the last character: an empty page must
       not index in front of the buffer */
    if(pageLen > 0 && Host->Page[pageLen-1]=='/')
    {
        Host->type = 1;            /* Html file */
        return 1;
    }

    /* Maybe a directory: no '.' anywhere and no query string / fragment */
    if(strchr(Host->Page,'.') == NULL && bArgs==0)
    {
        if (OwsPageLooksLikeDynamicAsset(Host->Page))
        {
            Host->type = bAggressiveIndexMode ? 4 : 1;
            return 1;
        }
        if(pageLen>=MAXPAGESIZE-1)
            return -1;

        strcat(Host->Page,"/");
        Host->type = 1;            /* Html file */
        return 1;
    }

    rlen = strlen(rPage);

    for(i=0; HtmlExtensions[i][0] != 0; i++)
    {
        if (OwsPageHasSuffixCi(rPage, rlen, (const char*)HtmlExtensions[i]))
        {
            Host->type = 1;        /* Html file */
            return 1;
        }
    }

    for(i=0; PlainTextExtension[i][0] != 0; i++)
    {
        if (OwsPageHasSuffixCi(rPage, rlen, (const char*)PlainTextExtension[i]))
        {
            Host->type = 2;
            return 1;
        }
    }

    /* Support for custom extensions (e.g. pdf, swf).
       Accept both "...file.ext" and "...file.ext?query=...": the extension
       is looked for right in front of a remaining '?', if any. */
    qmark = strchr(rPage, '?');
    baseLen = qmark ? (size_t)(qmark - rPage) : rlen;

    for(i=0; CustomExtensions[i][0] != 0; i++)
    {
        size_t elen = strlen(CustomExtensions[i]);

        if (baseLen >= elen &&
            stricmp(rPage + (baseLen - elen), (char*)CustomExtensions[i]) == 0)
        {
            Host->type = 4;
            return 1;
        }
    }

    if(bArgs==1)
        Host->type = (bAggressiveIndexMode && OwsPageLooksLikeDynamicAsset(Host->Page)) ? 4 : 1;
    else
        Host->type = bAggressiveIndexMode ? 4 : 3;            /* discard it */

    return 1;
}

/* PortNumFromHostname
* hostname -><-
* Splits "host:port" at the first ':' (the hostname is cut in place) and
* returns the port number.
*   hostname="www.auuuu.com:90" => hostname="www.auuuu.com"; return 90;
*
* Returns PORT when hostname is NULL, contains no ':' or the port part is
* empty ("host:"), and 0 when the port part does not start with a decimal
* number in the range 1..65535. Characters after the digits are ignored.
*/
unsigned int PortNumFromHostname(char* hostname)
{
    char *colon;
    const char *p;
    unsigned long port = 0;
    int nDigits = 0;

    if (!hostname)
        return PORT;

    colon = strchr(hostname, ':');
    if (colon == NULL)
        return PORT;

    *colon = 0;                 /* hostname now ends in front of the ':' */

    p = colon + 1;
    while (*p == ' ' || *p == '\t')
        p++;

    /* an empty port means "use the default" */
    if (*p == 0)
        return PORT;

    while (*p >= '0' && *p <= '9')
    {
        port = port * 10 + (unsigned long)(*p - '0');
        if (port > 65535UL)
            return 0;           /* out of range: not a usable port */
        nDigits++;
        p++;
    }

    if (nDigits == 0)
        return 0;               /* not a number (e.g. "host:abc", "host:-5") */

    return (unsigned int)port;
}


/* GenerateURL
* Host -> URL <-
* Builds "scheme://host[:port]page" into URL.
*  - The scheme is https when Host.isSSL is set or the port is 443,
*    http otherwise.
*  - The port is written only when it differs from the default, which is
*    443 when Host.isSSL is set and PORT otherwise.
*
* URL must be large enough for the result; no size is passed in, so the
* caller is responsible for the buffer. Always returns 1.
*/
int GenerateURL(struct sHost Host,char* URL)
{
    const char *scheme;
    int showPort;

    scheme = (Host.isSSL || Host.port == 443) ? "https://" : "http://";

    if (Host.isSSL)
        showPort = (Host.port != 443);
    else
        showPort = (Host.port != PORT);

    /* The port is formatted straight into URL: there is no intermediate
       fixed-size buffer that a wide port value could overrun. */
    if (showPort)
        sprintf(URL, "%s%s:%d%s", scheme, Host.Host, Host.port, Host.Page);
    else
        sprintf(URL, "%s%s%s", scheme, Host.Host, Host.Page);

    return 1;
}




/* OwsParseUrlTidy
* Trims sh->Host and sh->Page and replaces every "&amp;" in the page
* with "&".
*/
static void OwsParseUrlTidy(struct sHost *sh)
{
    char trimmed[MAXPAGESIZE];

    strtrim(sh->Host, sh->Host);
    strtrim(sh->Page, trimmed);
    ReplaceStr(trimmed, sh->Page, "&amp;", "&");
}

/* ParseUrl
* url -> sh <-
* Splits url into host, port, page and SSL flag, normalises the page with
* CheckPage and classifies it with PageType.
*   "http://www.test.com/page.htm" ==>
*        sh->Host = "www.test.com", sh->Page = "/page.htm", sh->port = PORT
*
* Absolute URLs ("http://", "https://" and scheme-relative "//host/...")
* carry their own host; "https://" and port 443 set sh->isSSL, and a
* scheme-relative URL inherits isSSL from currentHost.
* Any other URL is relative: host, port and isSSL are taken from
* currentHost and the page is resolved against the directory of
* currentHost->Page (unless it starts with '/'). A relative link whose last
* path segment has no '.', that has no query string, does not look like a
* dynamic asset and does not already end in '/' gets a '/' appended.
*
* The fragment ("#...") is cut off in place, i.e. url itself is modified.
*
* Returns the result of PageType (1 on success), or -1 when:
*   url or sh is NULL, the URL uses a non-web scheme (mailto:, tel:, ...),
*   it is too long, empty, has an empty host, is relative without a
*   currentHost, or the page does not pass CheckPage.
*/
int ParseUrl(char *url, struct sHost *sh, struct sHost *currentHost)
{
    /* pseudo-URLs that are not crawlable web resources */
    static const char *const skipped[] = {
        "ftp://", "mailto:", "about:", "irc://", "news://", "tel:",
        "javascript:", "vbscript:", "sms:", "callto:", "skype:", "geo:",
        "fax:", "facetime:", "data:", "intent:"
    };
    char tUrl[MAXURLSIZE];
    char BaseDir[MAXPAGESIZE];
    unsigned int offset = 0;
    size_t urlLen;
    size_t relLen;
    size_t baseLen;
    size_t hostLen;
    size_t i;
    char *u;
    char *frag;
    char *sep;
    int hasExt = 0;
    int needSlash = 0;

    if (url == NULL || sh == NULL)
        return -1;

    /* discard non-web pseudo-URLs (also in the form "/tel:...") */
    u = url;
    while (*u == ' ' || *u == '\t' || *u == '\r' || *u == '\n')
        u++;
    if (*u == '/')
        u++;

    for (i = 0; i < sizeof(skipped) / sizeof(skipped[0]); i++)
    {
        if (strnicmp(u, (char *)skipped[i], strlen(skipped[i])) == 0)
            return -1;
    }

    if (strlen(url) > MAXURLSIZE - 1)
        return -1;

    /* reset the host structure */
    memset(sh, 0, sizeof(struct sHost));
    sh->isSSL = 0;

    /* remove the fragment (#...) in place */
    frag = strchr(url, '#');
    if (frag != NULL)
        *frag = 0;

    if (url[0] == 0)
        return -1;

    urlLen = strlen(url);

    /* scheme detection */
    if (strnicmp(url, "http://", 7) == 0) {
        offset = 7;
    } else if (strnicmp(url, "https://", 8) == 0) {
        offset = 8;
        sh->isSSL = 1;
    } else if (strncmp(url, "//", 2) == 0) {
        /* scheme-relative: //host/... inherits the scheme of currentHost */
        offset = 2;
        if (currentHost && currentHost->isSSL)
            sh->isSSL = 1;
    }

    /* a scheme with nothing behind it is not a URL */
    if (offset > 0 && urlLen == offset)
        return -1;

    /* tUrl = url without the scheme; urlLen < MAXURLSIZE, so it fits */
    memcpy(tUrl, url + offset, urlLen - offset + 1);

    /* ---- absolute URL: it had a scheme or started with "//" ---- */
    if (offset > 0)
    {
        /* the host ends at the first '/' or '?' */
        sep = strpbrk(tUrl, "/?");

        /* "http:///x" and friends: there is no host at all */
        if (sep == tUrl)
            return -1;

        if (sep != NULL) {
            /* host + page */
            hostLen = (size_t)(sep - tUrl);
            if (hostLen > (size_t)(MAXHOSTSIZE - 1))
                hostLen = (size_t)(MAXHOSTSIZE - 1);
            memcpy(sh->Host, tUrl, hostLen);
            sh->Host[hostLen] = '\0';

            strncpy(sh->Page, sep, MAXPAGESIZE - 1);
            sh->Page[MAXPAGESIZE - 1] = '\0';
        } else {
            /* host only: the page is "/" */
            strncpy(sh->Host, tUrl, MAXHOSTSIZE - 1);
            sh->Host[MAXHOSTSIZE - 1] = '\0';
            strcpy(sh->Page, "/");
        }

        /* port from the hostname (e.g. www.site.it:8080) or the default */
        sh->port = PortNumFromHostname(sh->Host);

        /* https without an explicit port uses 443 */
        if (sh->port == PORT && sh->isSSL)
            sh->port = 443;

        /* port 443 implies SSL even without a scheme */
        if (sh->port == 443)
            sh->isSSL = 1;

        OwsParseUrlTidy(sh);

        /* same host and port as the current host: inherit its host_id */
        if (currentHost &&
            currentHost->host_id != 0 &&
            strcmp(currentHost->Host, sh->Host) == 0 &&
            currentHost->port == sh->port)
        {
            sh->host_id = currentHost->host_id;
        }

        /* CheckPage reports an unusable page with 0 */
        if (CheckPage(sh) <= 0)
            return -1;

        return PageType(sh);
    }

    /* ---- relative URL ---- */
    relLen = urlLen;                    /* offset is 0: tUrl equals url */

    if (relLen > MAXPAGESIZE - 1)
        return -1;

    if (currentHost == NULL)
        return -1;

    strncpy(sh->Host, currentHost->Host, MAXHOSTSIZE - 1);
    sh->Host[MAXHOSTSIZE - 1] = '\0';

    /* base directory: only needed when the link is not rooted at '/' */
    if (tUrl[0] != '/')
        GetDir(currentHost->Page, BaseDir);
    else
        BaseDir[0] = 0;

    baseLen = strlen(BaseDir);

    /* Does the last path segment carry a '.' (i.e. look like a file)?
       The scan runs from the end and stops at the last '/'; index 0 is
       intentionally not examined. */
    for (i = relLen; i > 0; i--)
    {
        if (tUrl[i] == '/')
            break;
        if (tUrl[i] == '.')
        {
            hasExt = 1;
            break;
        }
    }

    /* "dir1/dir2" is treated as a directory and gets a trailing '/',
       unless it has a query string, looks like a dynamic asset or already
       ends in '/' (appending there would yield "dir//", and "/" -> "//"). */
    if (!hasExt &&
        strchr(tUrl, '?') == NULL &&
        !OwsPageLooksLikeDynamicAsset(tUrl) &&
        tUrl[relLen - 1] != '/')
    {
        needSlash = 1;
    }

    /* the bound includes the optional '/' so that sh->Page cannot overflow */
    if (baseLen + relLen + (size_t)needSlash >= MAXPAGESIZE)
        return -1;

    strcpy(sh->Page, BaseDir);
    strcat(sh->Page, tUrl);
    if (needSlash)
        strcat(sh->Page, "/");

    /* inherit port and SSL from the current host */
    sh->port  = currentHost->port;
    sh->isSSL = currentHost->isSSL;

    OwsParseUrlTidy(sh);

    if (currentHost->host_id != 0)
        sh->host_id = currentHost->host_id;

    if (CheckPage(sh) <= 0)
        return -1;

    return PageType(sh);
}


/* GetHostId
*  Looks up host.Host / host.port in the table hostlist of database DB1.
*  Returns the id of the host when a row exists, 0 when it does not exist,
*  the query fails, or the query buffer cannot be allocated.
*/
int GetHostId(struct sHost host)
{
    char       *sqlQuery;
    MYSQL_RES   gRes;          /* passed to the query helper only; not read here */
    MYSQL_RES  *res = NULL;
    MYSQL_ROW   row;
    int         id  = 0;
    int         qret;

    sqlQuery = malloc(MAXQUERYSIZE);
    if (sqlQuery == NULL)
    {
        MemoryCorruptedHandler("GetHostId");
        return 0;
    }

    snprintf_mysql_escaped_sql_statement(
        &gMysqlDB1,
        sqlQuery,
        MAXQUERYSIZE - 1,
        "SELECT id FROM %s.hostlist WHERE hostname='%s' AND port = %d LIMIT 1",
        DB1,
        host.Host,
        host.port
    );

    /* The result pointer lives on the stack, so there is no second
       allocation that could fail or leak. */
    qret = my_mysql_query_and_store_results(&gMysqlDB1, sqlQuery, &res, &gRes, BLOCKDB1);

    FREE(sqlQuery);

    /* only read the row when the query succeeded and produced a result set */
    if (qret == 0 && res != NULL)
    {
        row = mysql_fetch_row(res);
        if (row && row[0])
            id = atoi(row[0]);
    }

    /* release the result set on every path, including a reported failure */
    if (res)
        mysql_free_result(res);

    return id;
}




/* pRelationships
*  Stores a link relationship between two hosts in the table rels.
*
*  links  - linking host; when NULL the global IndexingHost is used and
*           the linking page is stored as "/"
*  linked - linked host; its Description is stored as textlink
*  level  - 1: store the host-to-host relation only (both pages are '/')
*           2: also store the linking page and the linked page
*           any other value: nothing is stored
*
*  Returns 1 when bTesting is set or after the INSERT has been issued;
*  returns 0 when DB2 is in fatal error, level is not 1 or 2, linked is
*  NULL, or either host id cannot be determined.
*/
int pRelationships(struct sHost* links, struct sHost* linked, int level)
{
    struct sHost *source;
    char *sqlQuery;
    int host_id;
    int linkedhost_id;

    if (bTesting == 1)
        return 1;

    /* If DB2 is in a fatal error state we can no longer retrieve the
       host ids: skip writing the relationships. */
    if (gDb2FatalError)
        return 0;

    if (level != 1 && level != 2)
        return 0;

    /* linked is dereferenced below; refuse a NULL pointer instead of crashing */
    if (linked == NULL)
        return 0;

    /* Fall back to the host currently being indexed when no linking host is given */
    source = links ? links : &IndexingHost;

    /* Use the cached ids when present, otherwise look them up in hostlist */
    if (source->host_id == 0)
        host_id = GetHostId(*source);
    else
        host_id = source->host_id;

    if (linked->host_id == 0)
        linkedhost_id = GetHostId(*linked);
    else
        linkedhost_id = linked->host_id;

    if (host_id == 0 || linkedhost_id == 0)
        return 0;

    sqlQuery = malloc(MAXQUERYSIZE);
    if (sqlQuery == NULL)
    {
        MemoryCorruptedHandler("pRelationships");
        return 0;   /* only reached if the handler returns */
    }

    if (level == 1)    /* saves hostname only */
        snprintf_mysql_escaped_sql_statement(
            &gMysqlDB1,
            sqlQuery,
            MAXQUERYSIZE - 1,
            "INSERT IGNORE INTO rels (host_id,linkedhost_id,page,linkedpage,textlink) "
            "VALUES(%d,%d,'/','/', '%s')",
            host_id,
            linkedhost_id,
            linked->Description
        );
    else               /* level == 2: saves the pages as well */
        snprintf_mysql_escaped_sql_statement(
            &gMysqlDB1,
            sqlQuery,
            MAXQUERYSIZE - 1,
            "INSERT IGNORE INTO rels (host_id,linkedhost_id,page,linkedpage,textlink) "
            "VALUES(%d,%d,'%s','%s', '%s')",
            host_id,
            linkedhost_id,
            (links ? links->Page : "/"),
            linked->Page,
            linked->Description
        );

    my_mysql_query(&gMysqlDB1, sqlQuery, BLOCKDB1);
    FREE(sqlQuery);

    return 1;
}



/* UnencodeHexValue
*  Returns the numeric value (0-15) of the hexadecimal digit c,
*  or -1 if c is not a hexadecimal digit.
*/
static int UnencodeHexValue(int c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    return -1;
}

/* unencode
*  Decodes the percent-encoded (URL-encoded) string in [src, last) into dest.
*
*  src  - start of the encoded input
*  last - exclusive end of the input; decoding also stops at the first '\0'
*  dest - output buffer, always NUL-terminated on return. The output is
*         never longer than the input consumed, so (input length + 1) bytes
*         are sufficient. dest may be the same buffer as src because the
*         write position never passes the read position.
*
*  "%XY", where X and Y are both hexadecimal digits inside the bounds, is
*  replaced by the single byte 0xXY. A '%' that is not followed by two
*  hexadecimal digits is replaced by '?' and the characters after it are
*  copied unchanged.
*  Does nothing when src or dest is NULL.
*/
void unencode(char *src, char *last, char *dest)
{
    if (!src || !dest)
        return;

    /* treat 'last' as an exclusive bound; also stop at '\0' */
    for (; src < last && *src != '\0'; src++, dest++)
    {
        int hi = -1;
        int lo = -1;

        if (*src != '%')
        {
            *dest = *src;
            continue;
        }

        /* Both digits must lie before 'last'. The distance is compared
           instead of forming src + 2, which could point past the buffer. */
        if (last - src > 2)
        {
            hi = UnencodeHexValue((unsigned char)src[1]);
            if (hi >= 0)    /* src[1] is not '\0', so src[2] is readable */
                lo = UnencodeHexValue((unsigned char)src[2]);
        }

        if (hi >= 0 && lo >= 0)
        {
            *dest = (char)((hi << 4) | lo);
            src += 2;       /* skip the two digits just consumed */
        }
        else
        {
            *dest = '?';    /* malformed escape: only the '%' is consumed */
        }
    }

    *dest = '\0';
}


#endif

/*EOF*/
