
/* OpenWebSpider
 *
 *  Authors:     Stefano Alimonti AND Stefano Fantin
 *  Version:     0.8
 *  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it
 *
 *
 * This file is part of OpenWebSpider
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */


/* robots.txt example file

	User-agent: *
	Disallow: /cgi-bin/
	Disallow: /tmp/
	Disallow: /private/

	User-agent: OpenWebSpider
	Crawl-Delay: 5
	Disallow: /private/

*/

#ifndef __ROBOTSTXT
#define __ROBOTSTXT

/* ParseRobotsSitemaps
 *
 * Scans a robots.txt body for "sitemap:" directives (matched with
 * my_stristr, anywhere in the text) and tries to queue every URL found.
 *
 *  html : NUL-terminated robots.txt text; NULL is accepted and yields 0.
 *  host : host record handed to ParseUrl() and AddUrl() as context;
 *         host.level+1 is used as the level of each queued URL.
 *
 * Returns the number of sitemap URLs for which AddUrl() returned 1.
 * Every URL accepted by ParseUrl() is also printed to stdout.
 */
int ParseRobotsSitemaps(char* html,struct sHost host)
{
char* pCur = html;
char* pEol;
size_t len;
char sSitemap[MAXURLSIZE];
char sTrimSitemap[MAXURLSIZE];
struct sHost smHost;
int queued = 0;

	if(html == NULL)
		return 0;

	while((pCur = my_stristr(pCur, "sitemap:")) != NULL)
	{
		pCur += 8;	/* strlen("sitemap:") */

		/* Cut at the NEAREST line terminator. Looking for "\r\n" first would,
		 * in a file with mixed line endings, skip over a bare "\n" and pull
		 * the following lines into the URL. With no terminator at all the
		 * value runs to the end of the text. */
		len = strcspn(pCur,"\r\n");
		pEol = pCur + len;

		if(len > 0 && len < (size_t)(MAXURLSIZE-1))
		{
			/* The buffer is zero-filled first, so the copy is always terminated */
			memset(sSitemap,0,sizeof(sSitemap));
			memcpy(sSitemap,pCur,len);

			strtrim(sSitemap,sTrimSitemap);
			if(ParseUrl(sTrimSitemap,&smHost,&host)!=-1)
			{
				if(AddUrl(smHost,host.level+1,&host)==1)
				{
					char profileMsg[1024];
					queued++;
					gProfileSourceSitemap++;
					snprintf(profileMsg,sizeof(profileMsg),
					         "discover source=robots_sitemap host=%.100s page=%.255s from=%.100s",
					         smHost.Host,smHost.Page,host.Host);
					CRAWLER_PROFILE_LOG(profileMsg);
				}

				printf("   - Sitemap: %s\r\n",sTrimSitemap);
			}
		}

		/* Resume the search after the line that was just handled */
		pCur = pEol;
	}

	return queued;
}

/* ParseSitemapXML
 *
 * Extracts URLs from a sitemap document and tries to queue them.
 *
 * Three passes are made over the text:
 *   1. every <loc>...</loc> element;
 *   2. every element listed in extraLocTags (image/video/news extensions);
 *   3. only when bAggressiveIndexMode is set: every quoted value of the
 *      attributes listed in attrNames that looks like a URL.
 * In each pass "&amp;" is replaced by "&" before the value is given to
 * ParseUrl().
 *
 *  html  : NUL-terminated document text; NULL yields 0. Text containing
 *          neither "<urlset" nor "<sitemapindex" is ignored (returns 0).
 *  host  : host record handed to ParseUrl() and AddUrl() as context.
 *  level : level of the sitemap itself; URLs are queued with level+1.
 *
 * Returns the number of URLs for which AddUrl() returned 1.
 */
int ParseSitemapXML(char* html,struct sHost host, unsigned int level)
{
char* pCur = html;
char* pEnd;
char sLoc[MAXURLSIZE];
char sTrimLoc[MAXURLSIZE];
char sFixedLoc[MAXURLSIZE];
struct sHost locHost;
int queued = 0;
	/* Pairs of opening/closing tags; the list ends with an empty string.
	 * NOTE(review): <video:price> appears to carry a price rather than a URL
	 * in the video sitemap extension - confirm it belongs in this list. */
	static const char *extraLocTags[] = {
		"<image:loc>","</image:loc>",
		"<video:content_loc>","</video:content_loc>",
		"<video:player_loc>","</video:player_loc>",
		"<video:thumbnail_loc>","</video:thumbnail_loc>",
		"<video:gallery_loc>","</video:gallery_loc>",
		"<video:price>","</video:price>",
		"<news:loc>","</news:loc>",
		"\0","\0"
	};
	/* Attribute prefixes scanned in aggressive mode; ends with an empty string */
	static const char *attrNames[] = {
		"href=",
		"url=",
		"src=",
		"content=",
		"content_url=",
		"thumbnail_url=",
		"poster=",
		"\0"
	};
	int tagIndex;
	int attrIndex;

	if(html == NULL)
		return 0;

	/* Not a sitemap: neither root element is present */
	if(my_stristr(html,"<urlset") == NULL &&
	   my_stristr(html,"<sitemapindex") == NULL)
		return 0;

	/* Pass 1: plain <loc> elements */
	while((pCur = my_stristr(pCur,"<loc>")) != NULL)
	{
		pCur += 5;	/* strlen("<loc>") */
		pEnd = my_stristr(pCur,"</loc>");
		if(pEnd == NULL)
			break;	/* unterminated element: nothing more to extract */

		/* Empty and over-long values are skipped */
		if((pEnd-pCur) > 0 && (pEnd-pCur) < MAXURLSIZE-1)
		{
			memset(sLoc,0,sizeof(sLoc));
			memset(sFixedLoc,0,sizeof(sFixedLoc));
			strncpy(sLoc,pCur,pEnd-pCur);
			strtrim(sLoc,sTrimLoc);
			ReplaceStr(sTrimLoc,sFixedLoc,"&amp;","&");

			if(ParseUrl(sFixedLoc,&locHost,&host)!=-1)
			{
				if(AddUrl(locHost,level+1,&host)==1)
				{
					char profileMsg[1024];
					queued++;
					gProfileSourceSitemap++;
					snprintf(profileMsg,sizeof(profileMsg),
					         "discover source=sitemap_loc host=%.100s page=%.255s from=%.100s",
					         locHost.Host,locHost.Page,host.Host);
					CRAWLER_PROFILE_LOG(profileMsg);
				}
			}
		}

		pCur = pEnd + 6;	/* strlen("</loc>") */
	}

	/* Pass 2: extension elements, one full scan of the document per tag pair */
	for(tagIndex=0; extraLocTags[tagIndex][0] != '\0'; tagIndex += 2)
	{
		pCur = html;
		while((pCur = my_stristr(pCur,(char*)extraLocTags[tagIndex])) != NULL)
		{
			pCur += strlen(extraLocTags[tagIndex]);
			pEnd = my_stristr(pCur,(char*)extraLocTags[tagIndex+1]);
			if(pEnd == NULL)
				break;

			if((pEnd-pCur) > 0 && (pEnd-pCur) < MAXURLSIZE-1)
			{
				memset(sLoc,0,sizeof(sLoc));
				memset(sFixedLoc,0,sizeof(sFixedLoc));
				strncpy(sLoc,pCur,pEnd-pCur);
				strtrim(sLoc,sTrimLoc);
				ReplaceStr(sTrimLoc,sFixedLoc,"&amp;","&");

				if(ParseUrl(sFixedLoc,&locHost,&host)!=-1)
				{
					if(AddUrl(locHost,level+1,&host)==1)
					{
						char profileMsg[1024];
						queued++;
						gProfileSourceSitemap++;
						snprintf(profileMsg,sizeof(profileMsg),
						         "discover source=sitemap_media_loc host=%.100s page=%.255s from=%.100s",
						         locHost.Host,locHost.Page,host.Host);
						CRAWLER_PROFILE_LOG(profileMsg);
					}
				}
			}

			pCur = pEnd + strlen(extraLocTags[tagIndex+1]);
		}
	}

	/* Pass 3: quoted attribute values, only in aggressive mode */
	if(bAggressiveIndexMode)
	{
		for(attrIndex=0; attrNames[attrIndex][0] != '\0'; attrIndex++)
		{
			pCur = html;
			while((pCur = my_stristr(pCur,(char*)attrNames[attrIndex])) != NULL)
			{
				char quote;
				size_t n=0;

				pCur += strlen(attrNames[attrIndex]);
				while(*pCur && (*pCur==' ' || *pCur=='\t' || *pCur=='\r' || *pCur=='\n'))
					pCur++;

				/* Only quoted values are considered */
				if(*pCur!='\"' && *pCur!='\'')
					continue;
				quote = *pCur++;

				memset(sLoc,0,sizeof(sLoc));
				memset(sFixedLoc,0,sizeof(sFixedLoc));
				while(*pCur && *pCur!=quote && n+1<sizeof(sLoc))
					sLoc[n++]=*pCur++;
				sLoc[n]='\0';

				/* The copy stopped because the buffer is full, not because the
				 * closing quote was reached: the value is too long. Skip it, as
				 * the element passes do, instead of queueing a truncated URL. */
				if(*pCur!='\0' && *pCur!=quote)
					continue;

				strtrim(sLoc,sTrimLoc);
				ReplaceStr(sTrimLoc,sFixedLoc,"&amp;","&");

				/* Accept only values that look like a URL or an absolute path */
				if((my_stristr(sFixedLoc,"http://") || my_stristr(sFixedLoc,"https://") ||
				    sFixedLoc[0]=='/' || my_stristr(sFixedLoc,"//")) &&
				   ParseUrl(sFixedLoc,&locHost,&host)!=-1)
				{
					if(AddUrl(locHost,level+1,&host)==1)
					{
						char profileMsg[1024];
						queued++;
						gProfileSourceSitemap++;
						snprintf(profileMsg,sizeof(profileMsg),
						         "discover source=sitemap_attr host=%.100s page=%.255s from=%.100s attr=%.40s",
						         locHost.Host,locHost.Page,host.Host,attrNames[attrIndex]);
						CRAWLER_PROFILE_LOG(profileMsg);
					}
				}
			}
		}
	}

	return queued;
}


int ParseRobotsTxt(char* html,struct sHost host)
{
char* pCur;
char* pCRLF;
int i;
int iHtmlLen;
char sDisallow[MAXPAGESIZE];
char sTrimDisallow[MAXPAGESIZE];
struct sHost locHost;
int pos=0;

	printf(" + Parsing robots.txt\r\n");
	ParseRobotsSitemaps(html,host);
	_strlwr(html);

	ReplaceChr(html,'\t',' ');
	OnlyOneSpace(html,html,MAXPACKETSIZE);
 
	if((pCur=my_stristr(html,"user-agent: openwebspider"))<html)
		pCur=my_stristr(html,"user-agent: *");

	if(pCur<html)
		return 0;

	pCur+=13;
	iHtmlLen=strlen(pCur);

	for(i=0;i<iHtmlLen;i++)
	{
		if(pos==MAXDISALLOW)
			return 1;

		if(strnicmp(pCur+i,"disallow:",9)==0)
		{
			pCRLF=strstr(pCur+i,"\r\n");

			if(pCRLF==NULL)
				pCRLF=strstr(pCur+i,"\n");

			if(strchr(pCur+i,'#')>pCur+i && strchr(pCur+i,'#')<pCRLF)
				pCRLF=strchr(pCur+i,'#');

			if(pCRLF && pCRLF-(pCur+i+10)>0 && pCRLF-(pCur+i+10)<MAXPAGESIZE-1 && pos<MAXDISALLOW)
			{
				memset(sDisallow,0,MAXPAGESIZE);
				strncpy(sDisallow,pCur+i+10,pCRLF-(pCur+i+10));

				if(ParseUrl(strtrim(sDisallow,sTrimDisallow),&locHost,&host)==-1)
				{
					i+=(pCRLF-(pCur+i));
					continue;
				}

				strcpy(lstRobotsExclusions[pos++],locHost.Page);
				printf("   - Disallow: %s\r\n",locHost.Page);

				i+=(pCRLF-(pCur+i));
			}
		
		}
		else if(strnicmp(pCur+i,"crawl-delay:",12)==0)
		{
			pCRLF=strstr(pCur+i,"\r\n");

			if(pCRLF==NULL)
				pCRLF=strstr(pCur+i,"\n");

			if(strchr(pCur+i,'#')>pCur+i && strchr(pCur+i,'#')<pCRLF)
				pCRLF=strchr(pCur+i,'#');

			/*TODO: trim(page)*/
			if(pCRLF && pCRLF-(pCur+i+13)>0 && pCRLF-(pCur+i+13)<4-1 && pos<4)
			{
			char sTmpCrawlDelay[5];

				memset(sTmpCrawlDelay,0,sizeof(sTmpCrawlDelay));
				strncpy(sTmpCrawlDelay,pCur+i+13,pCRLF-(pCur+i+13));
				iRobCrawlDelay=atoi(sTmpCrawlDelay);

				printf("   - Crawl Delay: %i\r\n", iRobCrawlDelay);

				i+=(pCRLF-(pCur+i));
			}
		}
		else if(strnicmp(pCur+i,"user-agent:",11)==0)
			return 1;
	}

return 1;
}

/* CheckRobotExclusion
 *
 * Tests a page path against the stored robots.txt exclusions.
 *
 *  page : NUL-terminated page path to test.
 *
 * Walks lstRobotsExclusions[] up to MAXDISALLOW entries, stopping at the
 * first empty one. The page is excluded when an entry is a case-insensitive
 * prefix of it.
 *
 * Returns 0 if the page is excluded, 1 if it may be fetched.
 */
int CheckRobotExclusion(char* page)
{
size_t ruleLen;
int idx=0;

	while(idx<MAXDISALLOW && lstRobotsExclusions[idx][0]!=0)
	{
		ruleLen=strlen(lstRobotsExclusions[idx]);

		/* A rule longer than the page can never be a prefix of it */
		if(strlen(page)>=ruleLen &&
		   strnicmp(lstRobotsExclusions[idx],page,ruleLen)==0)
			return 0;

		idx++;
	}

	return 1;
}

#endif


/*EOF*/
