
/* OpenWebSpider
 *
 *  Author:     Stefano Alimonti aka Shen139
 *  Version:    0.5.1
 *  Mail:       shen139 [at] openwebspider (dot) org
 *
 *
 * This file is part of OpenWebSpider
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

/*
 *OpenWebSpider option '-T' will ignore this module
 */

/*
 *
$ gcc -g -c regexFilter.c
$ gcc -g -shared -Wl,-soname,regexFilter.so.0 -o regexFilter.so regexFilter.o -lc
||
$ make mod_regexfilter
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../modHeader.h"
#include "../../platform.h"

#include "../../regex.h"

/*
 * Set of optional regular-expression filters used by this module.
 * Every compiled regex_t is paired with a flag; the code in this file only
 * runs a regex when its flag is 1.
 */
typedef struct __filter
{
	regex_t hostFilter;		/* compiled from the "hostname=" entry; run against arg->hostInfo->Host */
	unsigned short int bHost;	/* 1 when hostFilter has been compiled and must be applied */
	regex_t pageFilter;		/* compiled from the "page=" entry; run against arg->hostInfo->Page */
	unsigned short int bPage;	/* 1 when pageFilter has been compiled and must be applied */
	regex_t htmlFilter;		/* compiled from the "html=" entry; run against arg->html */
	unsigned short int bHtml;	/* 1 when htmlFilter has been compiled and must be applied */
	regex_t textFilter;		/* compiled from the "text=" entry; run against arg->text */
	unsigned short int bText;	/* 1 when textFilter has been compiled and must be applied */
}_FILTER;

/* Module-wide filter state: filled in by ParseRegexConf, read by modFilter */
struct __filter rFilter;

#ifndef WIN32
/*
 * Case-insensitive comparison helpers for builds where WIN32 is not defined.
 * Both simply forward to strcasecmp / strncasecmp and hand back their result.
 */

/* Compare a and b ignoring case; 0 means equal */
int stricmp(char*a,char*b)
{
	int iResult;

	iResult = strcasecmp(a, b);
	return iResult;
}

/* Compare at most c characters of a and b ignoring case; 0 means equal */
int strnicmp(char*a,char*b,int c)
{
	int iResult;

	iResult = strncasecmp(a, b, c);
	return iResult;
}
#endif


/*
 * UnToken: copy str into out, dropping every character that appears in Tokens.
 *
 *  str    : NUL-terminated input string
 *  Tokens : NUL-terminated set of characters to strip
 *  out    : destination buffer; must have room for at least len+1 bytes
 *  len    : maximum number of characters of str that are examined
 *
 * return: 1 -> out holds the filtered, NUL-terminated text
 *         0 -> a NULL argument was supplied (out is set to "" when it is not NULL)
 */
int UnToken(char* str,char* Tokens,char* out,int len)
{
int c,x=0;

	if(str==NULL || Tokens==NULL || out==NULL)
	{
		if(out!=NULL)
			out[0]=0;
		return 0;
	}

	/* stop at the end of str or after len characters, whichever comes first */
	for(c=0;c<len && str[c]!=0;c++)
	{
		/* str[c] is never NUL here, so strchr cannot match the terminator of Tokens */
		if(strchr(Tokens,str[c])==NULL)
			out[x++]=str[c];
	}
	out[x]=0;

return 1;
}

/*
 * Release every compiled filter that is currently active in rFilter and mark
 * all four filters as unused.
 */
static void ResetRegexFilters(void)
{
	if(rFilter.bHost==1)
		regfree(&rFilter.hostFilter);
	if(rFilter.bPage==1)
		regfree(&rFilter.pageFilter);
	if(rFilter.bHtml==1)
		regfree(&rFilter.htmlFilter);
	if(rFilter.bText==1)
		regfree(&rFilter.textFilter);

	rFilter.bHost=rFilter.bPage=rFilter.bHtml=rFilter.bText=0;
}

/*
 * Compile the pattern found after "key=" on a configuration line.
 *
 *  pFilter : regex slot inside rFilter to (re)compile
 *  pActive : flag paired with pFilter; set to 1 only after a successful compile
 *  sValue  : text following the '=' (may still carry the line terminator)
 *
 * return: 1 -> OK (an empty value leaves the current filter untouched)
 *         0 -> regcomp rejected the pattern; the filter is left inactive
 */
static int CompileRegexFilter(regex_t* pFilter,unsigned short int* pActive,char* sValue)
{
char sString[500];

	/* strip CR/LF left by fgets */
	UnToken(sValue,"\r\n",sString,499);
	if(sString[0]==0)
		return 1;

	/* free the previous pattern first, and never leave the flag set on a regex that failed to compile */
	if(*pActive==1)
	{
		regfree(pFilter);
		*pActive=0;
	}

	if(regcomp(pFilter,sString,REG_EXTENDED) != 0)
		return 0;

	*pActive=1;
return 1;
}

/*
 * ParseRegexConf: read the filter configuration from pF into rFilter.
 *
 * Recognised lines:
 *   []            start of the global section
 *   [name]        start of a section used only when name equals hostname
 *   hostname=RE   page=RE   html=RE   text=RE   (extended regular expressions)
 *   lines starting with '#', and empty lines, are skipped
 *
 * A section matching hostname replaces whatever the global section defined;
 * once such a section has been read, a later "[]" ends the parsing.
 * Lines that belong to a section for another host are skipped.
 *
 * return: 0   -> OK
 *       : > 0 -> error while parsing file at line $iLine
 */
int ParseRegexConf(FILE* pF,char* hostname)
{
char sLine[500];
char* pClose;
int iLine=0;
int bOpenEntry=0;
int bSpecificHostname=0;

	/* loop on fgets itself: a read error ends the loop instead of spinning forever on !feof() */
	while(fgets(sLine,sizeof(sLine),pF)!=NULL)
	{
		iLine++;

		if(strnicmp(sLine,"[]",2)==0)
		{
			/* the host-specific section wins over the global one */
			if(bSpecificHostname==1)
				return 0;

			/*global regex*/
			bOpenEntry=1;
			ResetRegexFilters();
		}
		else
		if(sLine[0]=='[' && (pClose=strchr(sLine+1,']'))!=NULL)
		{
		char currentHostname[500];
		size_t nameLen=(size_t)(pClose-(sLine+1));

			/* nameLen is always smaller than sizeof(sLine), so it fits */
			memcpy(currentHostname,sLine+1,nameLen);
			currentHostname[nameLen]=0;

			if(hostname!=NULL && strcmp(currentHostname,hostname)==0)
			{
				bSpecificHostname=1;
				bOpenEntry=1;
				ResetRegexFilters();
			}
			else
				bOpenEntry=0;
		}
		else
		if(sLine[0]=='#' || sLine[0]=='\r' || sLine[0]=='\n' || sLine[0]==0)
			continue;
		else
		if(bOpenEntry==0)
			continue;	/* entry of a section that is not for us */
		else
		if(strnicmp(sLine,"hostname=",9)==0)
		{
			if(CompileRegexFilter(&rFilter.hostFilter,&rFilter.bHost,sLine+9)==0)
				return iLine;
		}
		else
		if(strnicmp(sLine,"page=",5)==0)
		{
			if(CompileRegexFilter(&rFilter.pageFilter,&rFilter.bPage,sLine+5)==0)
				return iLine;
		}
		else
		if(strnicmp(sLine,"html=",5)==0)
		{
			if(CompileRegexFilter(&rFilter.htmlFilter,&rFilter.bHtml,sLine+5)==0)
				return iLine;
		}
		else
		if(strnicmp(sLine,"text=",5)==0)
		{
			if(CompileRegexFilter(&rFilter.textFilter,&rFilter.bText,sLine+5)==0)
				return iLine;
		}
		else
			return iLine;	/* unknown entry */
	}

return 0;
}

/*
 * modFilter: decide whether the current page must be indexed.
 *
 * Every filter that is active in rFilter must match its field of arg
 * (hostInfo->Host, hostInfo->Page, html, text); the first one that does not
 * match discards the page. The content of arg is left untouched.
 *
 * return: 1 -> index the page
 *         0 -> discard it (also when arg, or a needed arg->hostInfo, is NULL)
 */
int modFilter (struct functArg* arg)
{
	if(arg==NULL)
		return 0;	/*don't index*/

	/* the host and page filters both read through arg->hostInfo */
	if(rFilter.bHost==1 || rFilter.bPage==1)
	{
		if(arg->hostInfo==NULL)
			return 0;
	}

	if(rFilter.bHost==1)
		if(regexec(&rFilter.hostFilter, arg->hostInfo->Host, 0, 0, 0) != 0)	/*do not match? => don't index*/
			return 0;
	if(rFilter.bPage==1)
		if(regexec(&rFilter.pageFilter, arg->hostInfo->Page, 0, 0, 0) != 0)	/*do not match? => don't index*/
			return 0;
	if(rFilter.bHtml==1)
		if(regexec(&rFilter.htmlFilter, arg->html, 0, 0, 0) != 0)	/*do not match? => don't index*/
			return 0;
	if(rFilter.bText==1)
		if(regexec(&rFilter.textFilter, arg->text, 0, 0, 0) != 0)	/*do not match? => don't index*/
			return 0;

	return 1;	/*OK... index*/
}

/*
 * modInitFilter: open mod_regex.conf and load the filters for hostname.
 *
 *  hostname : host whose section of the configuration should be applied
 *  error    : buffer that receives a message when the function fails
 *             NOTE(review): its size is not visible from here; the messages are
 *             written with sprintf, so the caller must supply a buffer large
 *             enough for the text plus hostname - confirm against the caller
 *
 * return: 1 -> configuration loaded
 *         0 -> file not found or parse error (message written to error)
 */
int modInitFilter (char* hostname, char* error)
{
FILE* pF;
int errLine;

	pF=ows_fopen_config("mod_regex.conf","r",NULL,0);
	if(pF==NULL)
	{
		if(error!=NULL)
			sprintf(error,"File not found: mod_regex.conf or %s/mod_regex.conf", OWS_SYSCONFDIR);
		return 0;
	}

	errLine=ParseRegexConf(pF, hostname);

	/* close the file on every path, including the parse-error one */
	fclose(pF);

	if(errLine>0)
	{
		if(error!=NULL)
			sprintf(error,"Error while parsing mod_regex.conf at line: %i for hostname: %s",errLine, hostname!=NULL ? hostname : "(null)");
		return 0;
	}

return 1;
}
