
/* OpenWebSpider
 *
 *  Authors:     Stefano Alimonti AND Stefano Fantin
 *  Version:     0.8
 *  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it
 *
 *
 * This file is part of OpenWebSpider
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#ifndef __HTMLFNCT
#define __HTMLFNCT
#include <string.h>


/* ForgeHTTPPacket
 * Builds the HTTP/1.1 GET request for hst.Page on hst.Host into packet.
 *   hst.Page = "/prova.htm" ==> packet = "GET /prova.htm HTTP/1.1\r\n..."
 *
 *  - hst       : host descriptor; only hst.Page and hst.Host are read
 *  - packet    : output buffer for the request text
 *  - packetLen : size of packet in bytes
 *
 * Spaces and ASCII control characters (including CR, LF, TAB and DEL) found
 * in hst.Page are percent-encoded, so a crafted URL cannot break out of the
 * request line and inject extra header lines. Bytes >= 0x80 are sent as-is.
 *
 * Returns 1 on success, 0 when packet is NULL/empty, when the escaped path
 * does not fit in MAXURLSIZE, or when the whole request does not fit in
 * packetLen. A path that does not fit is rejected instead of being truncated,
 * because a truncated path would request a different resource.
 */
int ForgeHTTPPacket(struct sHost hst,char * packet, size_t packetLen)
{
    static const char hexDigits[] = "0123456789ABCDEF";
    char unicodedFilename[MAXURLSIZE];
    size_t g=0;
    size_t i, pageLen;
    int wrote;

    if (!packet || packetLen == 0)
        return 0;

    unicodedFilename[0] = '\0';
    pageLen = strlen(hst.Page);

    for(i=0; i < pageLen; i++)
    {
        unsigned char ch = (unsigned char)hst.Page[i];

        if (ch <= 0x20 || ch == 0x7F)
        {
            /* space or control character: emit %XX (needs 3 bytes + NUL) */
            if (g + 3 >= sizeof(unicodedFilename))
                return 0;
            unicodedFilename[g++] = '%';
            unicodedFilename[g++] = hexDigits[ch >> 4];
            unicodedFilename[g++] = hexDigits[ch & 0x0F];
        }
        else
        {
            if (g + 1 >= sizeof(unicodedFilename))
                return 0;
            unicodedFilename[g++] = (char)ch;
        }
        unicodedFilename[g] = '\0';
    }

    wrote = snprintf(packet, packetLen,
	                     "GET %s HTTP/1.1\r\n"
	                     "Host: %s\r\n"
	                     "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 OpenWebSpider/0.8\r\n"
	                     "Accept: text/html,application/xhtml+xml,application/xml;q=0.95,application/json;q=0.95,text/css;q=0.9,application/javascript;q=0.9,text/javascript;q=0.9,application/pdf;q=0.9,image/avif,image/webp,image/apng,image/svg+xml,image/*;q=0.9,video/*;q=0.9,audio/*;q=0.9,application/octet-stream;q=0.8,*/*;q=0.7\r\n"
                     "Accept-Language: en-US,en;q=0.8,it;q=0.7,*;q=0.5\r\n"
                     "Accept-Encoding: identity\r\n"
                     "Connection: close\r\n"
                     "\r\n",
                     unicodedFilename,
                     hst.Host);
    /* snprintf reports the length it wanted to write: >= packetLen means truncated */
    if (wrote < 0 || (size_t)wrote >= packetLen)
        return 0;

    return 1;
}


/* ParseHTTPRequest
 * Splits a raw HTTP response into status line, redirect target and body.
 *
 *  - recvdpkt   : buffer holding the received HTTP response
 *  - recvLen    : number of valid bytes in recvdpkt
 *  - htmlOut    : receives the body (always NUL-terminated)
 *  - maxout     : size of htmlOut; the body is truncated to maxout-1 bytes
 *  - httpHeader : receives the status line (e.g. "HTTP/1.1 200 OK");
 *                 MAXHTTPSTATUSSIZE bytes are cleared
 *  - stuff      : receives the value of the Location header for 301/302
 *                 replies; MAXHOSTSIZE bytes are cleared
 *                 NOTE(review): values up to MAXURLSIZE-1 bytes are copied
 *                 here, so the caller's buffer must be at least MAXURLSIZE
 *                 bytes - confirm against the callers.
 *  - level      : crawl depth (not used here)
 *  - bodyLen    : optional; receives the number of body bytes copied
 *
 * Returns:
 *  0 = error / not an HTTP response / header terminator not found
 *  1 = valid response with a status other than 200 (e.g. 301/302/404/500)
 *  2 = 200 OK response
 */
int ParseHTTPRequest(char* recvdpkt,
                     unsigned int recvLen,
                     char* htmlOut,
                     unsigned int maxout,
                     char* httpHeader,
                     char* stuff,
                     int   level,
                     unsigned int* bodyLen)
{
    unsigned int c;
    int   loc = 0;
    char* sLocation = stuff;

    (void)level;

    /* Reject missing buffers before anything is written through them */
    if (recvdpkt == NULL || htmlOut == NULL || httpHeader == NULL || sLocation == NULL)
        return 0;
    if (maxout <= 1)
        return 0;

    /* Not an HTTP response -> give up */
    if (recvLen < 6 || strnicmp(recvdpkt,"HTTP/1",6) != 0)
        return 0;

    memset(httpHeader, 0, MAXHTTPSTATUSSIZE);
    memset(sLocation,  0, MAXHOSTSIZE);
    htmlOut[0] = '\0';
    if (bodyLen)
        *bodyLen = 0;

    /* Status line: "HTTP/1.x xxx ..." up to the first CRLF */
    for (c = 0; c + 1 < recvLen; c++)
    {
        if (recvdpkt[c] == '\r' && recvdpkt[c + 1] == '\n')
        {
            unsigned int firstLineLen = c;
            if (firstLineLen >= MAXHTTPSTATUSSIZE)
                firstLineLen = MAXHTTPSTATUSSIZE - 1;
            memcpy(httpHeader, recvdpkt, firstLineLen);
            httpHeader[firstLineLen] = '\0';
            break;
        }
    }

    /* Location: is only of interest for a 301/302 redirect */
    if ( strnicmp(httpHeader,"HTTP/1.1 302",12) == 0 ||
         strnicmp(httpHeader,"HTTP/1.0 302",12) == 0 ||
         strnicmp(httpHeader,"HTTP/1.1 301",12) == 0 ||
         strnicmp(httpHeader,"HTTP/1.0 301",12) == 0 )
    {
        loc = 1;
    }

    /* Walk the header block byte by byte */
    for (c = 0; c + 3 < recvLen; c++)
    {
        /* Extract "Location: <url>".
         * Header names are case-insensitive, and the name must start a line
         * (directly after a CRLF) so that e.g. "Content-Location:" is not
         * mistaken for the redirect target. */
        if (loc == 1 && c >= 2 && (c + 8) < recvLen &&
            recvdpkt[c - 2] == '\r' && recvdpkt[c - 1] == '\n' &&
            strnicmp(recvdpkt + c, "Location:", 9) == 0)
        {
            char *start = recvdpkt + c + 9; /* skip "Location:" */
            char *end;
            size_t len;
            char *bufEnd = recvdpkt + recvLen;

            /* optional whitespace between the colon and the value */
            while (start < bufEnd && (*start == ' ' || *start == '\t'))
                start++;

            /* the value runs up to the CRLF that ends the header line */
            end = start;
            while ((end + 1) < bufEnd)
            {
                if (end[0] == '\r' && end[1] == '\n')
                    break;
                end++;
            }

            /* only accept the value if its terminating CRLF was really seen */
            if ((end + 1) < bufEnd)
            {
                len = (size_t)(end - start);
                if (len < MAXURLSIZE)
                {
                    memcpy(sLocation, start, len);
                    sLocation[len] = '\0';
                }
                else
                {
                    strcpy(sLocation, "<Url too long>");
                }
            }
        }

        /* End of header: \r\n\r\n -> copy the body into htmlOut */
        if (strncmp(recvdpkt + c, "\r\n\r\n", 4) == 0)
        {
            unsigned int copyLen = recvLen - (c + 4);
            if (copyLen > maxout - 1)
                copyLen = maxout - 1;

            if (copyLen > 0)
                memcpy(htmlOut, recvdpkt + c + 4, copyLen);
            htmlOut[copyLen] = '\0';
            if (bodyLen)
                *bodyLen = copyLen;

            /* 2 = 200 OK (the only status that gets indexed), 1 = any other status */
            if (strnicmp(httpHeader,"HTTP/1.1 200",12) == 0 ||
                strnicmp(httpHeader,"HTTP/1.0 200",12) == 0)
                return 2;
            else
                return 1;
        }
    }

    /* header terminator never found */
    return 0;
}
/* RemoveTag
 * Removes, in place, every region of html that starts with startTag and ends
 * with endTag (both located with my_stristr), leaving a single space where
 * the region was. Originally added to strip HTML comments (Marius Roibu).
 *
 *  - html     : NUL-terminated, writable buffer; it is shortened in place
 *  - startTag : text that opens the region (e.g. "<!--", "<script")
 *  - endTag   : text that closes the region (e.g. "-->", "</script>")
 *
 * A start tag with no end tag after it stops the processing and leaves the
 * remainder of the buffer untouched. Returns html.
 */
char* RemoveTag(char* html, char* startTag, char* endTag)
{
    char* startTagFound;
    char* endTagFound = NULL;
    char* resumeFrom;
    size_t startTagLen;
    size_t endTagLen;

    if (html == NULL || startTag == NULL || endTag == NULL)
        return html;

    startTagLen = strlen(startTag);
    endTagLen = strlen(endTag);
    if (startTagLen == 0)
        return html;

    startTagFound = my_stristr(html, startTag);
    if (startTagFound != NULL)
        endTagFound = my_stristr(startTagFound, endTag);

    /* stop as soon as a start tag has no end tag after it (broken HTML) */
    while (startTagFound != NULL && endTagFound != NULL && startTagFound < endTagFound)
    {
        /* move endTagFound just past the closing tag */
        endTagFound += endTagLen;

        /* the removed region is replaced by one space */
        *startTagFound = ' ';

        /* pull the tail of the string (including its '\0') up behind the space */
        memmove(startTagFound + 1, endTagFound, strlen(endTagFound) + 1);

        /* Nothing before startTagFound matched startTag, so the next match can
         * begin at most startTagLen-1 bytes before the space just written;
         * resuming there avoids rescanning the whole buffer on every removal. */
        if ((size_t)(startTagFound - html) >= startTagLen - 1)
            resumeFrom = startTagFound - (startTagLen - 1);
        else
            resumeFrom = html;

        /* look for the next region to remove */
        startTagFound = my_stristr(resumeFrom, startTag);
        if (startTagFound != NULL)
            endTagFound = my_stristr(startTagFound, endTag);
        else
            endTagFound = NULL;
    }

    return html;
}

/* BetweenTag
 * Finds the first "<tag" in html that is followed by a space, '>', CR, LF or
 * TAB and copies the text that follows it into out.
 *   html: "<p align=center>bye bye</p>", tag "p", endwithstarttag 1
 *     ==> out = " align=center>bye bye"
 *
 *  - html            : text to search (not modified)
 *  - tag             : tag name without brackets, at most MAXTAGSIZE chars
 *  - out             : receives the extracted text, NUL-terminated
 *  - endwithstarttag : 1 = copy up to the closing "</tag>"
 *                      (e.g. <A href=sample.c>sample code</A>);
 *                      otherwise copy up to the next '>'
 *                      (e.g. <IMG src=sample.jpg>)
 *  - maxout          : size of out in bytes
 *
 * The search runs on an upper-cased copy of html produced by atoupper(); the
 * offsets found there are applied to html itself so that out keeps the
 * original letter case.
 * NOTE(review): this assumes atoupper() keeps byte offsets unchanged.
 *
 * Returns the offset in html just past "<tag", plus one, on success.
 * Returns -1 when the tag is not found, the extracted text is empty or does
 * not fit in maxout, or tag is longer than MAXTAGSIZE; out is set to "" in
 * the empty/does-not-fit case.
 */
int BetweenTag(char* html, char* tag,char* out,int endwithstarttag,int maxout)
{
/* sized for "<" + tag + NUL and "</" + tag + ">" + NUL */
char tmptag1[MAXTAGSIZE+2], tmptag2[MAXTAGSIZE+4];
int iRelPos=0;
int n;
char* tmpPacket;
char* startTag;
char* tmpP;

	n=snprintf(tmptag1,sizeof(tmptag1),"<%s",tag);
	if(n<0 || (size_t)n>=sizeof(tmptag1))
		return -1;
	n=snprintf(tmptag2,sizeof(tmptag2),"</%s>",tag);
	if(n<0 || (size_t)n>=sizeof(tmptag2))
		return -1;

	/* both patterns are matched against the upper-cased copy */
	_strupr(tmptag1);
	_strupr(tmptag2);

	tmpPacket = malloc(MAXPACKETBUFSIZE);

	if(tmpPacket==NULL)
	{
		MemoryCorruptedHandler("BetweenTag");
		return -1;	/* only reached if the handler returns */
	}

	atoupper(html,tmpPacket,MAXPACKETBUFSIZE-1);

	/* skip matches where "<tag" is only the prefix of a longer tag name */
	do
	{
		startTag=my_stristr(tmpPacket+iRelPos,tmptag1);
		if(startTag==NULL)
		{
			FREE(tmpPacket);
			return -1;
		}

		iRelPos=(startTag-tmpPacket)+strlen(tmptag1);

	}while(tmpPacket[iRelPos]!=' ' && tmpPacket[iRelPos]!='>' && tmpPacket[iRelPos]!='\r' && tmpPacket[iRelPos]!='\n' && tmpPacket[iRelPos]!='\t');

	if(endwithstarttag==1)					/* Ex. <A href=sample.c>sample code</A> */
		tmpP=my_stristr(tmpPacket+iRelPos,tmptag2);
	else							/* Ex. <IMG src=sample.jpg> */
		tmpP=strchr(tmpPacket+iRelPos,'>');

	if(tmpP!=NULL && tmpP>tmpPacket+iRelPos && tmpP-(tmpPacket+iRelPos)<maxout)
	{
		/* copy from the original text so that the letter case is preserved */
		strncpy(out,html+iRelPos,tmpP-(tmpPacket+iRelPos));
		out[tmpP-(tmpPacket+iRelPos)]=0;

		FREE(tmpPacket);

		return iRelPos+1;
	}

	out[0]=0;

	FREE(tmpPacket);

	return -1;
}

/* UnHtml
 * Strips the markup from html and writes the visible text into text.
 *   html = "<p align="left"><font face="Arial" size="2">TesT123</font></p>"
 *   text => "TesT123"
 *
 *  - html   : NUL-terminated, writable buffer. It is MODIFIED: comments,
 *             <script> and <style> regions are removed in place by RemoveTag
 *  - text   : output buffer, zero-filled first and always NUL-terminated
 *  - maxout : size of text in bytes
 *
 * Everything between '<' and '>' is dropped. Quotes, backslashes, CR, LF and
 * TAB become spaces, runs of spaces collapse to one, and a tag that directly
 * follows text yields one separating space ("a<br>b" => "a b").
 *
 * The whole input is scanned and only the OUTPUT is bounded by maxout-1, so
 * long markup in front of the text no longer uses up the budget.
 *
 * Returns the number of characters written to text (0 on invalid arguments).
 */
int UnHtml(char* html, char* text,int maxout)
{
size_t i;
int x=0, pOpen=0;
unsigned char curC;

	if(!html || !text || maxout <= 0)
		return 0;

	memset(text,0,maxout);
	if(maxout <= 1)
		return 0;

	RemoveTag(html,"<!--","-->");
	RemoveTag(html,"<script","</script>");
	RemoveTag(html,"<style","</style>");

	for(i=0;html[i]!='\0';i++)
	{
		curC=(unsigned char)html[i];

		if(curC=='<')
		{
			pOpen=1;

			/* "a<br>b"  => "a b";
			   "a <br>b" => "a b" */
			if(x && text[x-1]!=' ' && x < maxout-1)
				text[x++]=' ';
		}
		else
		if(curC=='>')
			pOpen=0;

		if(pOpen==0 && curC!='>')
		{
			/* characters that would disturb later processing become spaces */
			if(curC=='\''  || curC=='\"'  || curC=='\\' || curC=='\n' || curC=='\r' || curC=='\t')
				curC=' ';

			/* never emit two spaces in a row */
			if(x && text[x-1]==' ' && curC==' ')
				continue;
			else if(x < maxout-1)
				text[x++]=curC;
			else
				break;	/* output buffer is full */
		}
	}

	text[x]='\0';
	return x;
}

/* OwsExtractNextSrcsetUrl
 * Pulls the next URL out of a srcset-style list ("a.jpg 1x, b.jpg 2x").
 *
 *  - srcset  : position in the list to continue from
 *  - out     : receives the URL, NUL-terminated (truncated to outSize-1)
 *  - outSize : size of out; must be at least 2
 *
 * Leading commas/blanks are skipped, the URL is copied up to the first comma
 * or whitespace, and the rest of the entry (the descriptor) is discarded.
 * Returns the position just after the entry's comma (or the end of the
 * string), or NULL when the arguments are invalid or no URL is left.
 */
static const char *OwsExtractNextSrcsetUrl(const char *srcset, char *out, size_t outSize)
{
	const char *cursor;
	size_t used = 0;

	if(srcset == NULL || out == NULL || outSize < 2)
		return NULL;

	out[0] = '\0';

	/* step over separators left in front of the entry */
	for(cursor = srcset; ; cursor++)
	{
		char ch = *cursor;

		if(ch != ',' && ch != ' ' && ch != '\t' && ch != '\r' && ch != '\n')
			break;
	}
	if(*cursor == '\0')
		return NULL;

	/* copy the URL itself */
	for(;;)
	{
		char ch = *cursor;

		if(ch == '\0' || ch == ',' || isspace((unsigned char)ch) || used + 1 >= outSize)
			break;
		out[used++] = ch;
		cursor++;
	}
	out[used] = '\0';

	/* drop the descriptor ("1x", "640w", ...) and the comma that follows it */
	while(*cursor != '\0' && *cursor != ',')
		cursor++;
	if(*cursor == ',')
		cursor++;

	if(out[0] == '\0')
		return NULL;
	return cursor;
}

/* OwsAssetUrlHasSuffix
 * Tells whether url ends in a known asset file extension.
 *
 * Leading blanks and quotes are skipped. URLs using the data:, javascript:,
 * mailto:, tel:, blob: or about: schemes are rejected. The URL is then cut at
 * the first '?', '#', quote, ')', '<', '>' or whitespace and the remaining
 * text is compared (ignoring case) against the built-in extension list and
 * against CustomExtensions. With bAggressiveIndexMode set, any remaining
 * text that contains a '.' is accepted as well.
 *
 * Returns 1 on a match, 0 otherwise (also for empty or over-long URLs).
 */
static int OwsAssetUrlHasSuffix(const char *url)
{
static const char *exts[] = {
	".pdf",
	".jpg",".jpeg",".jpe",".jfif",".pjpeg",".png",".apng",".gif",".webp",".svg",
	".bmp",".ico",".cur",".tif",".tiff",".avif",".heic",".heif",".jp2",".j2k",".jpf",".jpx",".jxl",
	".mp4",".m4v",".webm",".ogv",".mov",".avi",".mkv",".flv",".wmv",".asf",
	".ts",".m2ts",".mpeg",".mpg",".mpe",".vob",".3gp",".3g2",".m3u8",".mpd",
	".mp3",".m4a",".m4b",".aac",".adts",".oga",".ogg",".opus",".wav",".flac",".wma",".weba",
	".aif",".aiff",".mid",".midi",
	".vtt",".srt","\0"
};
static const struct { const char *prefix; int len; } rejected[] = {
	{"data:",5}, {"javascript:",11}, {"mailto:",7},
	{"tel:",4}, {"blob:",5}, {"about:",6}
};
char clean[MAXURLSIZE];
const char *stop;
size_t cleanLen, extLen;
size_t r;
int idx;

	if(url == NULL || url[0] == '\0')
		return 0;

	/* ignore blanks and quotes in front of the URL */
	for(;;)
	{
		char ch = *url;

		if(ch != ' ' && ch != '\t' && ch != '\r' && ch != '\n' && ch != '\"' && ch != '\'')
			break;
		url++;
	}

	/* schemes that never name a downloadable asset */
	for(r = 0; r < sizeof(rejected)/sizeof(rejected[0]); r++)
	{
		if(strnicmp((char*)url, (char*)rejected[r].prefix, rejected[r].len) == 0)
			return 0;
	}

	/* keep only the part in front of the query string, fragment or delimiter */
	for(stop = url; *stop != '\0'; stop++)
	{
		if(*stop == '?' || *stop == '#' || *stop == '\"' || *stop == '\'' ||
		   *stop == ')' || *stop == '<' || *stop == '>' || isspace((unsigned char)*stop))
			break;
	}
	cleanLen = (size_t)(stop - url);
	if(cleanLen == 0 || cleanLen >= sizeof(clean))
		return 0;
	memcpy(clean, url, cleanLen);
	clean[cleanLen] = '\0';

	/* built-in extensions; the list ends at the empty string */
	for(idx = 0; exts[idx][0] != '\0'; idx++)
	{
		extLen = strlen(exts[idx]);
		if(extLen <= cleanLen && stricmp(clean + (cleanLen - extLen), (char*)exts[idx]) == 0)
			return 1;
	}

	/* user-configured extensions, terminated the same way */
	for(idx = 0; CustomExtensions[idx][0] != '\0'; idx++)
	{
		extLen = strlen(CustomExtensions[idx]);
		if(extLen <= cleanLen && stricmp(clean + (cleanLen - extLen), (char*)CustomExtensions[idx]) == 0)
			return 1;
	}

	if(bAggressiveIndexMode && strchr(clean, '.') != NULL)
		return 1;

	return 0;
}

/* OwsContainsCiLocal
 * Case-insensitive substring test.
 * Returns 1 when needle occurs somewhere in hay (letter case ignored through
 * tolower), 0 otherwise. NULL arguments and an empty needle yield 0.
 */
static int OwsContainsCiLocal(const char *hay, const char *needle)
{
	size_t hayLen, needleLen, start;

	if(hay == NULL || needle == NULL)
		return 0;

	hayLen = strlen(hay);
	needleLen = strlen(needle);
	if(needleLen == 0 || needleLen > hayLen)
		return 0;

	/* try every position where the needle still fits */
	for(start = 0; start + needleLen <= hayLen; start++)
	{
		size_t k = 0;

		while(k < needleLen &&
		      tolower((unsigned char)hay[start + k]) == tolower((unsigned char)needle[k]))
			k++;
		if(k == needleLen)
			return 1;
	}

	return 0;
}

/* OwsAssetUrlLooksDynamic
 * Heuristic for URLs that have no asset extension but still look as if they
 * serve one.
 *
 * The URL must start with "http://", "https://", "//", "/", "./" or "../".
 * It is accepted when it contains (ignoring case) one of the hint words
 * listed below, or, with bAggressiveIndexMode set, when it contains a '?'.
 *
 * Returns 1 when the URL looks like a dynamic asset URL, 0 otherwise.
 */
static int OwsAssetUrlLooksDynamic(const char *url)
{
static const char *hints[] = {
	"video","audio","image","media","pdf","download","file","asset",
	"photo","picture","thumb","stream","document","attachment",
	"cdn","static","upload","blob","playlist","embed","proxy","render",
	"raw","original","format=","mime"
};
size_t h;

	if(url == NULL || url[0] == '\0')
		return 0;

	/* only absolute, scheme-relative, root-relative or dot-relative URLs qualify */
	if(!(strnicmp((char*)url,"http://",7)==0 || strnicmp((char*)url,"https://",8)==0 ||
	     strnicmp((char*)url,"//",2)==0 || url[0]=='/' ||
	     strnicmp((char*)url,"./",2)==0 || strnicmp((char*)url,"../",3)==0))
		return 0;

	for(h = 0; h < sizeof(hints)/sizeof(hints[0]); h++)
	{
		if(OwsContainsCiLocal(url, hints[h]))
			return 1;
	}

	if(bAggressiveIndexMode && strchr(url,'?') != NULL)
		return 1;

	return 0;
}

/* OwsAggressiveAssetUrlCandidate
 * Loose test for "this string could be a URL worth queueing".
 *
 * Leading blanks and quotes are skipped. Strings starting with data:,
 * javascript:, mailto:, tel:, about:, "#inter" or "#include" are rejected.
 * The string must then start with "http://", "https://", "//", "/", "./" or
 * "../"; with bAggressiveIndexMode set, a string that contains a '/' and
 * starts with a letter, digit, '_', '-' or '.' is accepted too. Finally it
 * must not contain '<', '>', '{', '}', a backslash, CR, LF or TAB.
 *
 * Returns 1 for a candidate, 0 otherwise.
 */
static int OwsAggressiveAssetUrlCandidate(const char *url)
{
const char *scan;
int urlShaped;

	if(url == NULL || url[0] == '\0')
		return 0;

	/* ignore blanks and quotes in front of the URL */
	for(;;)
	{
		char ch = *url;

		if(ch != ' ' && ch != '\t' && ch != '\r' && ch != '\n' && ch != '\"' && ch != '\'')
			break;
		url++;
	}

	if(strnicmp((char*)url,"data:",5)==0)       return 0;
	if(strnicmp((char*)url,"javascript:",11)==0) return 0;
	if(strnicmp((char*)url,"mailto:",7)==0)     return 0;
	if(strnicmp((char*)url,"tel:",4)==0)        return 0;
	if(strnicmp((char*)url,"about:",6)==0)      return 0;
	if(strnicmp((char*)url,"#inter",6)==0)      return 0;
	if(strnicmp((char*)url,"#include",8)==0)    return 0;

	urlShaped = (strnicmp((char*)url,"http://",7)==0 || strnicmp((char*)url,"https://",8)==0 ||
	             strnicmp((char*)url,"//",2)==0 || url[0]=='/' ||
	             strnicmp((char*)url,"./",2)==0 || strnicmp((char*)url,"../",3)==0);

	/* aggressive mode also takes plain relative paths such as "img/a" */
	if(!urlShaped && bAggressiveIndexMode && strchr(url,'/') != NULL &&
	   (isalnum((unsigned char)url[0]) || url[0]=='_' || url[0]=='-' || url[0]=='.'))
		urlShaped = 1;

	if(!urlShaped)
		return 0;

	/* markup, template braces and control characters disqualify the string */
	for(scan = url; *scan != '\0'; scan++)
	{
		if(strchr("<>{}\\\r\n\t", *scan) != NULL)
			return 0;
	}

	return 1;
}

/* OwsUrlEntityUnescape
 * Rewrites s in place, turning every "&amp;" and "&#38;" (compared with
 * strnicmp) into a plain '&'. All other characters are kept. A NULL s is
 * ignored.
 */
static void OwsUrlEntityUnescape(char *s)
{
	char *in;
	char *outp;

	if(s == NULL)
		return;

	/* the result is never longer than the input, so one buffer is enough */
	in = s;
	outp = s;
	while(*in != '\0')
	{
		if(*in == '&' &&
		   (strnicmp(in,"&amp;",5) == 0 || strnicmp(in,"&#38;",5) == 0))
		{
			*outp++ = '&';
			in += 5;
		}
		else
		{
			*outp++ = *in++;
		}
	}
	*outp = '\0';
}

/* OwsJsonSlashUnescape
 * Rewrites s in place, undoing the JSON escapes that commonly appear inside
 * URLs:
 *   "\/"               -> "/"
 *   "\u002F", "\u002f" -> "/"
 *   "\u003A", "\u003a" -> ":"
 *   "\u0026"           -> "&"
 * Every other character, including other backslash sequences, is copied
 * unchanged. A NULL s is ignored.
 */
static void OwsJsonSlashUnescape(char *s)
{
	char *src;
	char *dst;

	if(s == NULL)
		return;

	src = s;
	dst = s;
	while(*src != '\0')
	{
		char decoded = '\0';

		if(src[0] == '\\')
		{
			if(src[1] == '/')
			{
				/* drop the backslash; the '/' is copied below */
				src++;
			}
			else if(src[1] == 'u' && src[2] == '0' && src[3] == '0')
			{
				/* each index is read only after the previous one was non-NUL */
				if(src[4] == '2' && (src[5] == 'F' || src[5] == 'f'))
					decoded = '/';
				else if(src[4] == '3' && (src[5] == 'A' || src[5] == 'a'))
					decoded = ':';
				else if(src[4] == '2' && src[5] == '6')
					decoded = '&';
			}
		}

		if(decoded != '\0')
		{
			*dst++ = decoded;
			src += 6;	/* length of "\uXXXX" */
		}
		else
		{
			*dst++ = *src++;
		}
	}
	*dst = '\0';
}

/* OwsProfileLogCandidateReject
 * Reports a rejected URL candidate to the crawler profile: one plain-text
 * line through CRAWLER_PROFILE_LOG and one JSON object through
 * CRAWLER_PROFILE_JSONL.
 *
 *  - source   : label of the scanner that produced the candidate
 *               ("asset" when NULL)
 *  - reason   : label of the rejection reason ("unknown" when NULL)
 *  - rawUrl   : the rejected text ("" when NULL)
 *  - baseHost : host the candidate was found on; its Host field is logged
 *               ("" when NULL)
 */
static void OwsProfileLogCandidateReject(const char *source, const char *reason, const char *rawUrl, struct sHost *baseHost)
{
const char *srcText    = source   ? source         : "asset";
const char *reasonText = reason   ? reason         : "unknown";
const char *rawText    = rawUrl   ? rawUrl         : "";
const char *baseText   = baseHost ? baseHost->Host : "";
char jSource[80], jReason[80], jRaw[512], jBase[160];
char profileMsg[1024];
char jsonMsg[1536];

	/* escaped copies for the JSON line */
	OwsJsonEscapeCopy(srcText, jSource, sizeof(jSource));
	OwsJsonEscapeCopy(reasonText, jReason, sizeof(jReason));
	OwsJsonEscapeCopy(rawText, jRaw, sizeof(jRaw));
	OwsJsonEscapeCopy(baseText, jBase, sizeof(jBase));

	/* plain-text line uses the unescaped values */
	snprintf(profileMsg,sizeof(profileMsg),
	         "candidate reject source=%.60s reason=%.60s raw=%.255s base=%.100s",
	         srcText, reasonText, rawText, baseText);
	CRAWLER_PROFILE_LOG(profileMsg);

	snprintf(jsonMsg,sizeof(jsonMsg),
	         "{\"event\":\"candidate\",\"decision\":\"reject\",\"source\":\"%.60s\",\"reason\":\"%.60s\",\"raw\":\"%.255s\",\"base\":\"%.100s\"}",
	         jSource,jReason,jRaw,jBase);
	CRAWLER_PROFILE_JSONL(jsonMsg);
}

static int OwsQueueNestedUrlParameters(const char *rawUrl, struct sHost *baseHost, struct sHost *fromHost, unsigned int level);

/* OwsQueueAssetUrl
 * Validates one raw URL candidate and, when it qualifies, queues it with
 * AddUrl.
 *
 *  - rawUrl      : candidate text as found in the page
 *  - baseHost    : host used to resolve relative URLs (passed to ParseUrl)
 *  - fromHost    : referring host passed on to AddUrl
 *  - level       : crawl level passed on to AddUrl
 *  - description : optional NUL-terminated description for the queued URL
 *  - source      : label of the scanner that found the candidate (for logs)
 *
 * Steps: copy and normalise the text (JSON slash escapes, "&amp;", trimming);
 * in aggressive mode also queue URLs nested in its query parameters; reject
 * it unless it has an asset suffix, looks dynamic, or (aggressive mode) is a
 * loose URL candidate; decode it with unencode(); resolve it with ParseUrl();
 * reject unsafe characters and hosts of type 3. Every rejection increments
 * its gProfileCandidateRejected* counter and is logged.
 *
 * Returns 1 when AddUrl returned 1, otherwise 0.
 */
static int OwsQueueAssetUrl(const char *rawUrl, struct sHost *baseHost, struct sHost *fromHost, unsigned int level, char *description, const char *source)
{
char trimurl[MAXURLSIZE];
char encodedurl[MAXURLSIZE];
struct sHost tmphst;
char profileMsg[1024];

	gProfileRawAssetCandidates++;
	if(!rawUrl || !rawUrl[0] || !baseHost)
	{
		gProfileCandidateRejectedEmpty++;
		OwsProfileLogCandidateReject(source, "empty_or_no_base", rawUrl, baseHost);
		return 0;
	}

	/* work on a bounded private copy; over-long input is truncated */
	strncpy(trimurl, rawUrl, sizeof(trimurl)-1);
	trimurl[sizeof(trimurl)-1]='\0';
	OwsJsonSlashUnescape(trimurl);
	OwsUrlEntityUnescape(trimurl);
	strtrim(trimurl, trimurl);

	/* URLs carried inside query parameters are queued on their own */
	if(bAggressiveIndexMode)
		OwsQueueNestedUrlParameters(trimurl, baseHost, fromHost, level);

	if(!OwsAssetUrlHasSuffix(trimurl) &&
	   !OwsAssetUrlLooksDynamic(trimurl) &&
	   !(bAggressiveIndexMode && OwsAggressiveAssetUrlCandidate(trimurl)))
	{
		gProfileCandidateRejectedNotAsset++;
		OwsProfileLogCandidateReject(source, "not_asset_profile", trimurl, baseHost);
		return 0;
	}

	memset(encodedurl,0,sizeof(encodedurl));
	unencode(trimurl,trimurl+strlen(trimurl),encodedurl);
	if(ParseUrl(encodedurl,&tmphst,baseHost)==-1)
	{
		gProfileCandidateRejectedParse++;
		OwsProfileLogCandidateReject(source, "parse_failed", trimurl, baseHost);
		return 0;
	}
	if(bTokenIn(encodedurl,"<>\r\n\t\\",strlen(encodedurl))!=0)
	{
		gProfileCandidateRejectedToken++;
		OwsProfileLogCandidateReject(source, "unsafe_token", trimurl, baseHost);
		return 0;
	}
	if(tmphst.type == 3)
	{
		gProfileCandidateRejectedType++;
		OwsProfileLogCandidateReject(source, "discard_type", trimurl, baseHost);
		return 0;
	}

	tmphst.viewed = 0;
	if(description)
	{
		/* strncpy stops reading at description's terminator (a fixed-size
		 * memcpy would read past the end of a short string) and zero-fills
		 * the rest of the MAXDESCRIPTIONSIZE-1 bytes.
		 * NOTE(review): the final byte of tmphst.Description is left as
		 * ParseUrl set it - confirm it is NUL there. */
		strncpy(tmphst.Description,description,MAXDESCRIPTIONSIZE-1);
	}
	if(AddUrl(tmphst,level,fromHost)==1)
	{
		snprintf(profileMsg,sizeof(profileMsg),
		         "discover source=%s host=%.100s page=%.255s base=%.100s",
		         source ? source : "asset", tmphst.Host, tmphst.Page, baseHost->Host);
		CRAWLER_PROFILE_LOG(profileMsg);
		return 1;
	}
	return 0;
}

/* OwsQueueNestedUrlParameters
 * Looks for URLs carried inside the parameters of rawUrl (e.g.
 * "...?url=http%3A%2F%2Fhost%2Fa.jpg") and queues each one through
 * OwsQueueAssetUrl with the source label "nested_param".
 *
 * For every key in the table below, each occurrence in rawUrl (located with
 * my_stristr) is followed up to the next '&', ';', '#', quote, '<', '>' or
 * whitespace. Values longer than 3 characters are decoded (unencode, JSON
 * slash escapes, "&amp;"), trimmed, and queued when they pass one of the
 * asset tests.
 *
 * OwsQueueAssetUrl calls back into this function, so a static depth counter
 * limits the nesting to two levels.
 * NOTE(review): the static counter is shared by all callers; check before
 * calling this from more than one thread.
 *
 * Returns the number of nested URLs queued by this call.
 */
static int OwsQueueNestedUrlParameters(const char *rawUrl, struct sHost *baseHost, struct sHost *fromHost, unsigned int level)
{
static const char *keys[] = {
	"url=","uri=","src=","href=","file=","path=","media=","asset=","download=",
	"downloadUrl=","contentUrl=","embedUrl=","thumbnailUrl=","poster=",
	"redirect=","target=","u=","q=",
	"\0"
};
static int nestedDepth=0;
char nested[MAXURLSIZE];
char decoded[MAXURLSIZE];
int queued = 0;
int k;

	if(rawUrl == NULL || baseHost == NULL || fromHost == NULL)
		return 0;
	if(nestedDepth >= 2)
		return 0;

	nestedDepth++;

	/* the key table ends at the empty string */
	for(k = 0; keys[k][0] != '\0'; k++)
	{
		const char *key = keys[k];
		size_t keyLen = strlen(key);
		const char *hit = my_stristr((char*)rawUrl, (char*)key);

		while(hit != NULL)
		{
			const char *value = hit + keyLen;
			size_t n = 0;

			/* copy the parameter value up to its delimiter */
			while(*value && *value!='&' && *value!=';' && *value!='#' &&
			      *value!='\"' && *value!='\'' && *value!='<' && *value!='>' &&
			      !isspace((unsigned char)*value) && n+1<sizeof(nested))
			{
				nested[n++] = *value++;
			}
			nested[n] = '\0';

			if(n > 3)
			{
				memset(decoded,0,sizeof(decoded));
				unencode(nested,nested+strlen(nested),decoded);
				OwsJsonSlashUnescape(decoded);
				OwsUrlEntityUnescape(decoded);
				strtrim(decoded, decoded);

				if(OwsAggressiveAssetUrlCandidate(decoded) ||
				   OwsAssetUrlHasSuffix(decoded) ||
				   OwsAssetUrlLooksDynamic(decoded))
				{
					if(OwsQueueAssetUrl(decoded, baseHost, fromHost, level, NULL, "nested_param"))
						queued++;
				}
			}

			/* continue behind the value that was just read */
			hit = my_stristr((char*)value, (char*)key);
		}
	}

	nestedDepth--;
	return queued;
}

/* OwsScanLooseAssetUrls
 * Scans the raw page text for URLs outside the regular tag/attribute parser.
 *
 * Pass 1: every run of text between a quote character (" or ') and the next
 *         identical quote is offered to OwsQueueAssetUrl (source "quoted").
 * Pass 2: the argument of every "url(" (compared with strnicmp), quoted or
 *         not, is offered as well (source "css_url").
 *
 * Candidates are cut at MAXURLSIZE-1 characters. Each queued URL increments
 * gProfileSourceLoose. Returns the number of URLs queued, or 0 when an
 * argument is NULL.
 */
static int OwsScanLooseAssetUrls(char *html, struct sHost *baseHost, struct sHost *fromHost)
{
char raw[MAXURLSIZE];
char *p;
int queued = 0;

	if(html == NULL || baseHost == NULL || fromHost == NULL)
		return 0;

	/* pass 1: quoted strings */
	p = html;
	while(*p != '\0')
	{
		if(*p == '\"' || *p == '\'')
		{
			char quote = *p;
			size_t n = 0;

			p++;
			while(*p && *p != quote && n+1 < sizeof(raw))
				raw[n++] = *p++;
			raw[n] = '\0';

			if(OwsQueueAssetUrl(raw, baseHost, fromHost, fromHost->level, NULL, "quoted"))
			{
				gProfileSourceLoose++;
				queued++;
			}

			/* unterminated string: do not step past the end of the text */
			if(*p == '\0')
				break;
		}
		p++;
	}

	/* pass 2: CSS url(...) */
	for(p = html; *p != '\0'; )
	{
		size_t n = 0;

		if(strnicmp(p,"url(",4) != 0)
		{
			p++;
			continue;
		}

		p += 4;
		while(*p && isspace((unsigned char)*p))
			p++;

		if(*p == '\"' || *p == '\'')
		{
			char quote = *p++;

			while(*p && *p != quote && n+1 < sizeof(raw))
				raw[n++] = *p++;
		}
		else
		{
			/* unquoted argument ends at ')' or whitespace */
			while(*p && *p != ')' && !isspace((unsigned char)*p) && n+1 < sizeof(raw))
				raw[n++] = *p++;
		}
		raw[n] = '\0';

		if(OwsQueueAssetUrl(raw, baseHost, fromHost, fromHost->level, NULL, "css_url"))
		{
			gProfileSourceLoose++;
			queued++;
		}
	}

	return queued;
}

/* OwsScanStructuredMediaFields
 * Aggressive-mode scan for "key: value" / "key = value" pairs whose key is
 * one of the names in the table below.
 *
 * For each occurrence of a key (located with my_stristr) that is followed,
 * after optional whitespace, by ':' or '=', the value is read: up to the
 * matching quote when it is quoted, otherwise up to ',', '}', ']', ';' or
 * whitespace. Values longer than one character are offered to
 * OwsQueueAssetUrl with the source label "structured".
 *
 * Each queued URL increments gProfileSourceLoose. Returns the number of URLs
 * queued; 0 when bAggressiveIndexMode is off or an argument is NULL.
 */
static int OwsScanStructuredMediaFields(char *html, struct sHost *baseHost, struct sHost *fromHost)
{
static const char *keys[] = {
	"src","href","url","uri","path","file","fileUrl","download","downloadUrl",
	"contentUrl","embedUrl","thumbnailUrl","thumbnail","poster","posterUrl",
	"image","images","video","videos","audio","media","asset","assets",
	"source","sources","srcset","data-src","data-url","data-href",
	"original","originalUrl","raw","rawUrl","full","fullUrl","large","largeUrl",
	"preview","previewUrl","stream","streamUrl","playlist","manifest",
	"\0"
};
char raw[MAXURLSIZE];
int queued = 0;
int k;

	if(!bAggressiveIndexMode || html == NULL || baseHost == NULL || fromHost == NULL)
		return 0;

	/* the key table ends at the empty string */
	for(k = 0; keys[k][0] != '\0'; k++)
	{
		size_t keyLen = strlen(keys[k]);
		char *p = html;

		for(;;)
		{
			char *q;
			size_t n = 0;

			p = my_stristr(p, (char*)keys[k]);
			if(p == NULL)
				break;

			q = p + keyLen;
			while(*q && isspace((unsigned char)*q))
				q++;

			/* not a key/value pair: retry one character further on */
			if(*q != ':' && *q != '=')
			{
				p++;
				continue;
			}

			q++;
			while(*q && isspace((unsigned char)*q))
				q++;

			if(*q == '\"' || *q == '\'')
			{
				char quote = *q++;

				while(*q && *q != quote && n+1 < sizeof(raw))
					raw[n++] = *q++;
			}
			else
			{
				while(*q && *q != ',' && *q != '}' && *q != ']' && *q != ';' &&
				      !isspace((unsigned char)*q) && n+1 < sizeof(raw))
					raw[n++] = *q++;
			}
			raw[n] = '\0';

			if(n > 1 && OwsQueueAssetUrl(raw, baseHost, fromHost, fromHost->level, NULL, "structured"))
			{
				gProfileSourceLoose++;
				queued++;
			}

			/* continue behind the value */
			p = q;
		}
	}

	return queued;
}

/* OwsBareTokenHasSlash
 * Tells whether the token starting at p contains a '/' before it ends.
 * The token ends at the first quote, '<', '>', ')', ']', '}', ',', ';' or
 * whitespace character, or at the end of the string.
 * Returns 1 when a '/' comes first, 0 otherwise (also for NULL).
 */
static int OwsBareTokenHasSlash(char *p)
{
	const char *cur;

	if(p == NULL)
		return 0;

	for(cur = p; *cur != '\0'; cur++)
	{
		if(*cur == '/')
			return 1;
		/* any delimiter ends the token without a slash having been seen */
		if(isspace((unsigned char)*cur) || strchr("\"'<>)]},;", *cur) != NULL)
			return 0;
	}

	return 0;
}

/* OwsIsBareUrlStart
 * Tells whether the text at p looks like the beginning of an unquoted URL.
 *
 * Accepted starts: "http://", "https://", "//", "./", "../", and a single
 * '/' followed by a character that is not '/', '>', '<', whitespace or the
 * end of the string. With bAggressiveIndexMode set, a token that starts with
 * a letter, digit, '_', '-' or '.' and contains a '/' is accepted too, unless
 * the text from p onwards contains "javascript:", "mailto:" or "tel:".
 *
 * Returns 1 when a URL may start at p, 0 otherwise.
 */
static int OwsIsBareUrlStart(char *p)
{
	unsigned char first;

	if(p == NULL || *p == '\0')
		return 0;

	/* absolute and scheme-relative URLs */
	if(strnicmp(p,"http://",7) == 0)
		return 1;
	if(strnicmp(p,"https://",8) == 0)
		return 1;
	if(strnicmp(p,"//",2) == 0)
		return 1;

	/* root-relative path: '/' followed by something that can belong to a path */
	if(*p == '/')
	{
		char next = p[1];

		if(next != '\0' && next != '/' && next != '>' && next != '<' &&
		   !isspace((unsigned char)next))
			return 1;
	}

	/* dot-relative paths */
	if(strnicmp(p,"./",2) == 0 || strnicmp(p,"../",3) == 0)
		return 1;

	/* aggressive mode: plain relative tokens such as "img/a.png" */
	first = (unsigned char)*p;
	if(bAggressiveIndexMode &&
	   (isalnum(first) || first == '_' || first == '-' || first == '.') &&
	   OwsBareTokenHasSlash(p) &&
	   !OwsContainsCiLocal(p,"javascript:") &&
	   !OwsContainsCiLocal(p,"mailto:") &&
	   !OwsContainsCiLocal(p,"tel:"))
		return 1;

	return 0;
}

/* OwsScanBareAssetUrls
 * Scans the page text for unquoted URLs.
 *
 * At every position accepted by OwsIsBareUrlStart the token is copied up to
 * the next quote, '<', '>', ')', ']', '}', ',', ';' or whitespace (at most
 * MAXURLSIZE-1 characters). Trailing '.', ':', '!' and '?' are stripped, and
 * tokens longer than one character are offered to OwsQueueAssetUrl with the
 * source label "bare". The scan then resumes behind the kept part of the
 * token.
 *
 * Each queued URL increments gProfileSourceBare. Returns the number of URLs
 * queued, or 0 when an argument is NULL.
 */
static int OwsScanBareAssetUrls(char *html, struct sHost *baseHost, struct sHost *fromHost)
{
char raw[MAXURLSIZE];
char *p;
int queued = 0;

	if(html == NULL || baseHost == NULL || fromHost == NULL)
		return 0;

	p = html;
	while(*p != '\0')
	{
		if(OwsIsBareUrlStart(p))
		{
			size_t n = 0;

			/* copy the token up to its delimiter */
			for(;;)
			{
				char ch = p[n];

				if(ch == '\0' || n+1 >= sizeof(raw))
					break;
				if(strchr("\"'<>)]},;", ch) != NULL || isspace((unsigned char)ch))
					break;
				raw[n++] = ch;
			}
			raw[n] = '\0';

			/* sentence punctuation glued to the end is not part of the URL */
			while(n > 0 && (raw[n-1]=='.' || raw[n-1]==':' || raw[n-1]=='!' || raw[n-1]=='?'))
				raw[--n] = '\0';

			if(n > 1 && OwsQueueAssetUrl(raw, baseHost, fromHost, fromHost->level, NULL, "bare"))
			{
				gProfileSourceBare++;
				queued++;
			}

			/* jump to the last kept character; the step below moves past it */
			if(n > 0)
				p += n - 1;
		}
		p++;
	}

	return queued;
}

/* OwsFastDiscoveryAllowsTag
 * Decides whether a tag/attribute pair is scanned for links.
 *
 * With bAggressiveIndexMode set every pair is allowed. Otherwise only the
 * pairs listed below are allowed (compared with stricmp); a NULL tag or attr
 * is refused.
 *
 * Returns 1 when the pair is allowed, 0 otherwise.
 */
static int OwsFastDiscoveryAllowsTag(const char *tag, const char *attr)
{
static const char *allowed[][2] = {
	{"base",   "href"},
	{"a",      "href"},
	{"ref",    "href"},
	{"area",   "href"},
	{"frame",  "src"},
	{"iframe", "src"}
};
size_t k;

	if(bAggressiveIndexMode)
		return 1;
	if(tag == NULL || attr == NULL)
		return 0;

	for(k = 0; k < sizeof(allowed)/sizeof(allowed[0]); k++)
	{
		if(stricmp((char*)tag,(char*)allowed[k][0]) == 0 &&
		   stricmp((char*)attr,(char*)allowed[k][1]) == 0)
			return 1;
	}

	return 0;
}


/* LookForUrls
 * html -> AddUrl() <-
 *
 * Extracts the links contained in html and queues them with AddUrl().
 *
 *  - html : page text; it is modified in place (HTML comments are removed
 *           by RemoveTag before scanning)
 *  - hst  : host/page the text was fetched from; used as the referrer and as
 *           the initial base for resolving relative URLs
 *
 * For every entry of taglist (terminated by flag == -1) that
 * OwsFastDiscoveryAllowsTag accepts, each occurrence of the tag bTag is
 * located with BetweenTag and a small state machine pulls the value of the
 * attribute eTag out of it. A "base" tag replaces the base used to resolve
 * the URLs that follow; for "a" tags the link text becomes the Description;
 * attributes whose name contains "srcset" are split into their single URLs.
 * With bAggressiveIndexMode set, three additional free-text scans run over
 * the whole page afterwards.
 *
 * A summary is written through CRAWLER_PROFILE_LOG and CRAWLER_PROFILE_JSONL.
 * Returns the number of URLs counted as found.
 */
int LookForUrls(char *html,struct sHost hst)
{
char a2a[MAXTAGLENGTH];			//<a>...........</a>
char tmpurl[MAXURLSIZE];
char trimurl[MAXURLSIZE];
char encodedurl[MAXURLSIZE];
char fnd[MAXDESCRIPTIONSIZE];
char strComment[MAXDESCRIPTIONSIZE];
int strlenhtml;
int c,i,x,tmpc,y;
struct sHost tmphst;
int apix=0;
int stage;
int nUrlFound=0;
int nTagFound=0;
int nSrcsetFound=0;
int nLooseFound=0;
int nBareFound=0;
int nStructuredFound=0;
struct sHost sBaseHref;
/* base for resolving relative URLs; switched to sBaseHref once a <base> tag is seen */
struct sHost* sReferringHost=&hst;
char profileMsg[1024];
char jsonMsg[1536];
char jHost[160], jPage[360];

    RemoveTag(html,"<!--","-->");

    /* measured after the comments are gone; html is not shortened again below */
    strlenhtml=strlen(html);

	/* one pass over the page per tag/attribute pair of taglist */
	for(y=0;taglist[y].flag!=-1;y++)
	{
	if(!OwsFastDiscoveryAllowsTag(taglist[y].bTag, taglist[y].eTag))
		continue;
	/* c = offset in html where the search for the next tag starts */
	tmpc=c=0;

		while(c<strlenhtml)
		{
			/* a2a receives the text that follows "<tag"; -1 = no further tag */
			if((tmpc = BetweenTag(html+c,taglist[y].bTag,a2a,taglist[y].flag ,sizeof(a2a)))==-1)
				break;

			ReplaceChr(a2a,'\n',' ');
			ReplaceChr(a2a,'\r',' ');

   			c += tmpc+strlen(taglist[y].bTag);

			/* stage: 0 = looking for the attribute name, 1 = looking for the
			   start of its value, 2 = copying the value, 3 = value complete.
			   x = write index in tmpurl, apix = 1 when the value is quoted */
			stage=0;
			x=0;
			apix=0;

			for(i=0;i<(signed)strlen(a2a);i++)
			{
				switch(stage)
				{
				case 0:		//looks for the attribute name (eTag)
					if(strnicmp(a2a+i,taglist[y].eTag,strlen(taglist[y].eTag))==0)
					{
						stage=1;	//attribute name found
						i+=strlen(taglist[y].eTag);
						memset(tmpurl,0,sizeof(tmpurl));
					}
					break;
				case 1:		//looks for '\"' or '\''
					if(a2a[i]=='\"' || a2a[i]=='\'' )					//opening quote found
					{
						stage=2;
						apix=1;
						break;
					}
					else
					if(a2a[i]=='=')
						break;
					if(a2a[i]!=' ' && a2a[i]!='\n' && a2a[i]!='\r')		//If a2a[i] is not a delimiter consider it as data (apix=0)
					{
						stage=2;
						i--;	//re-read this character as the first one of the value
					}
					break;
				case 2:
                    /* URL too long: give up on it (tmpurl is emptied) */
					if(x>=MAXURLSIZE-1)
					{
						stage=3;
						tmpurl[0]=0;
						break;
					}

					if(apix==1) /* "<a href="test.htm">test</a>" OR "<a href='test.htm'>test</a>" */
					{
						if(a2a[i]!='\"' && a2a[i]!='\'' )	//until the closing quote is found
						{
							tmpurl[x++]=a2a[i];
							break;
						}
						else								//closing quote found
						{
							stage=3;
							tmpurl[x]=0;
							break;
						}
					}
					else    /* "<a href=test.htm>test</a>" */
					{
						/* unquoted value: ends at a space, '>', a quote, or at
						   the last character of a2a */
						if(a2a[i]!=' ' && a2a[i]!='>' && a2a[i]!='\"' && a2a[i]!='\'' && strlen(a2a+i)!=1)
						{
							tmpurl[x++]=a2a[i];
							break;
						}
						else								//end of the value found
						{
							stage=3;
							tmpurl[x]=0;
							break;
						}
					}
				} /*switch*/
				if(stage==3)							//exits from for{}
					break;
			} /*for*/

		/* a complete value was extracted; javascript: links are ignored.
		   Every "continue" below resumes the while loop with the next tag. */
		if(stage==3)
			if(tmpurl[0]!=0 && strnicmp(tmpurl,"javascript:",11)!=0)
			{

				strtrim(tmpurl, trimurl);
				/* srcset-like attribute: the value is a list of URLs */
				if(my_stristr(taglist[y].eTag,"srcset") != NULL)
				{
					char srcsetUrl[MAXURLSIZE];
					const char *srcsetNext = trimurl;
					while((srcsetNext = OwsExtractNextSrcsetUrl(srcsetNext, srcsetUrl, sizeof(srcsetUrl))) != NULL)
					{
						if(OwsQueueAssetUrl(srcsetUrl, sReferringHost, &hst, hst.level, NULL, "srcset"))
						{
							gProfileSourceSrcset++;
							nUrlFound++;
							nSrcsetFound++;
						}
					}
					continue;
				}
				memset(encodedurl,0,sizeof(encodedurl));
				unencode(trimurl,trimurl+strlen(trimurl),encodedurl);	//Support 4 unicode

				/* fnd = description of the link; stays empty unless the tag is <a> */
				fnd[0]=0;
				if(stricmp(taglist[y].bTag,"base")==0)	//if TAG is BASE
				{
					if(ParseUrl(encodedurl,&sBaseHref,NULL)==-1)
						continue;

                    /* sReferringHost has the same hostname and port and has an host_id */
                    if( sReferringHost && sReferringHost->host_id!=0 && strcmp( sReferringHost->Host, sBaseHref.Host ) == 0 && sReferringHost->port == sBaseHref.port )
                    {
                        /* yes: this page is from the same domain: use currentHost host_id */
                        sBaseHref.host_id = sReferringHost->host_id;
                    }

					/* from here on relative URLs are resolved against <base href> */
					sReferringHost=&sBaseHref;

					continue;

				}
				else
					if(stricmp(taglist[y].bTag,"a")==0)	//if TAG is A
					{
						//a2a does not start with '<' and does not end with '>':
						//add both so that UnHtml sees a complete tag
						//(skipped when there is no room left for the '>')
						if(strlen(a2a) + 1 >= sizeof(a2a))
							continue;
						a2a[0]='<';
						strcat(a2a,">");

					//looks for the href's comment <a href...>XXX</a>
					UnHtml(a2a,fnd,sizeof(fnd));

					UnToken(fnd,"\r\n\t",strComment,strlen(fnd));

					strtrim(strComment,strComment);

					OnlyOneSpace(strComment,fnd,sizeof(fnd));
				}

				/* resolve the URL against the current base */
				if(ParseUrl(encodedurl,&tmphst,sReferringHost)==-1)
					continue;

				/* URLs containing markup or control characters are dropped */
				if(bTokenIn(encodedurl,"<>\r\n\t\\",strlen(encodedurl))==0)
				{
					tmphst.viewed = 0;
					memcpy(tmphst.Description,fnd,MAXDESCRIPTIONSIZE-1);

					if(tmphst.type != 3)	//Add only HTML or plain/text file or custom html files
					{
						nUrlFound++;
						nTagFound++;
						gProfileSourceTag++;
						AddUrl(tmphst,hst.level,&hst);
					}
				}
			}
		}
	}

	/* aggressive mode: additional free-text scans over the whole page */
	if(bAggressiveIndexMode)
	{
		nLooseFound = OwsScanLooseAssetUrls(html, sReferringHost, &hst);
		nStructuredFound = OwsScanStructuredMediaFields(html, sReferringHost, &hst);
		nBareFound = OwsScanBareAssetUrls(html, sReferringHost, &hst);
		nUrlFound += nLooseFound;
		nUrlFound += nStructuredFound;
		nUrlFound += nBareFound;
	}
	/* per-page summary: plain-text line first, then the JSON line */
	snprintf(profileMsg,sizeof(profileMsg),
	         "page discovery host=%.100s page=%.255s total=%d tag=%d srcset=%d loose=%d structured=%d bare=%d",
	         hst.Host,hst.Page,nUrlFound,nTagFound,nSrcsetFound,nLooseFound,nStructuredFound,nBareFound);
	CRAWLER_PROFILE_LOG(profileMsg);
	OwsJsonEscapeCopy(hst.Host, jHost, sizeof(jHost));
	OwsJsonEscapeCopy(hst.Page, jPage, sizeof(jPage));
	snprintf(jsonMsg,sizeof(jsonMsg),
	         "{\"event\":\"page_discovery\",\"host\":\"%.100s\",\"page\":\"%.255s\",\"total\":%d,\"tag\":%d,\"srcset\":%d,\"loose\":%d,\"structured\":%d,\"bare\":%d}",
	         jHost,jPage,nUrlFound,nTagFound,nSrcsetFound,nLooseFound,nStructuredFound,nBareFound);
	CRAWLER_PROFILE_JSONL(jsonMsg);

return nUrlFound;
}

#endif

/*EOF*/
