/* OpenWebSpider
 *
 *  Authors:     Stefano Alimonti AND Stefano Fantin
 *  Version:     0.8
 *  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it
 *  
 * 
 *  Compile with
 *  + Linux:  $ gcc openwebspider-0.6.c -o openwebspider `mysql_config --cflags --libs` -lpthread -ldl -rdynamic -Wall
 *   - mysql-devel needed
 *  + Windows: Microsoft Visual C++ 6.0
 *
 *
 * Web Site: http://www.openwebspider.org/
 * 
 *
 * FAQ about Robots and Search engine here: http://www.robotstxt.org/wc/faq.html
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *
 */

#define AUTHOR          Shen139 AND Fantin
#define VERSION         "0.8"
#define DBVERSION       1

#define USE_REGEX

#if defined(_WIN32) && !defined(WIN32)
#define WIN32
#endif

#include "platform.h"

#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <time.h>
#if defined(__linux__)
#include <malloc.h>
#endif
#include <ctype.h>
#include <stdarg.h>
#include <limits.h>
#include <errno.h>
#include "regex.h"

#ifndef __has_feature
#define __has_feature(x) 0
#endif

#if defined(__SANITIZE_ADDRESS__) || __has_feature(address_sanitizer)
/* In some environments (ptrace/sandbox) LeakSanitizer terminates the process at
   the end of the run. Only the leak check is switched off, so that ASan runs
   stay stable. */
const char *__asan_default_options(void)
{
    static const char asanOptions[] = "detect_leaks=0";

    return asanOptions;
}
#endif

#ifdef WIN32
  #ifndef WIN32_LEAN_AND_MEAN
  #define WIN32_LEAN_AND_MEAN
  #endif
  #include <winsock2.h>
  #include <ws2tcpip.h>
  #include <process.h>
  #include <windows.h>
  #include "snprintf.c"

  #pragma comment(lib,"libmySQL.lib")
  #pragma comment(lib,"Ws2_32.lib")
  /****************************************************************************/
  #include "mysql/mysql.h"
  /****************************************************************************/

#else /*linux*/
  #define _MULTI_THREADED
  #include <pthread.h>
  #include <sched.h>
  #include <sys/time.h>
  #include <unistd.h>
  #if defined(__has_include)
    #if __has_include(<mysql/mysql.h>)
      #include <mysql/mysql.h>
    #elif __has_include(<mysql.h>)
      #include <mysql.h>
    #elif __has_include(<mariadb/mysql.h>)
      #include <mariadb/mysql.h>
    #else
      #error "MySQL/MariaDB headers not found"
    #endif
  #else
    #include <mysql/mysql.h>
  #endif
  #include <pthread.h>
  #include <sys/types.h>
  #include <netinet/in.h>
  #include <netdb.h>
  #include <dlfcn.h>
  #include <string.h>
  #include <sys/socket.h>
  #include <arpa/inet.h>
#include <openssl/ssl.h>
#include <openssl/err.h>
#endif

#include "functions.h"

#include "mymutex.h"
#include "list.h"
#include "hstlist.h"
#include "htmlfnct.h"
#include "socket.h"
#include "sqlfnct.h"
#include "getopt.h"
#include "thread.h"
#include "misc.h"
#include "robots.h"
#include "rank.h"
#include "urlfunct.h"
#include "temptable.h"
#include "modules.h"
#include "sqlfnct.h"
#include "strfnct.h"
#include "server.h"

SSL_CTX *g_ssl_ctx = NULL;
int gDb2FatalError = 0;
#include "search.h"
#include "parse_conf.h"
#include "indexer.h"


/*
 * Prints the command-line help on stdout, then the error text on stderr,
 * and terminates the process with exit(0).
 *
 *  txt: error message shown after the "ERROR: " prefix.
 *
 * The function is declared int but never returns to the caller: every path
 * ends in exit().
 */
int usage(char *txt)
{
	/* help lines printed before the crawl-delay entry */
	static const char *const helpBeforeDelay[] = {
		"\n\nUsage: openwebspider [Arguments]\r\n",
		"Arguments:\r\n",
		"-I [Search string] (*) (Search the word or the words(between \"double-quotes\") from the database)\r\n",
		" or\r\n",
		" -O [hostname (optional)] (Build OWS own index on all un-indexed pages or on a single hostname if specified)\r\n",
		" or\r\n",
		"-i [start url] (*) (Start indexing pages starting from passed url)\r\n",
		"-A (Aggressive attachment/media discovery; slower)\r\n",
		"-t [Number of threads] (Default: 20)\r\n",
		"-s (Single Host Mode)\r\n",
		"-m [Limits the maximum level of depth in the tree of the pages] [Default: 0 (No limit)]\r\n",
		"-l (Limits the maximum number of pages indexed per site) [Default: 0 (No limit)]\r\n",
		"-c (Limits the maximum number of seconds per site) [Default: 0 (No limit)]\r\n",
		"-b (Limits the maximum number of bytes downloaded per site) [Default: 0 (No limit)]\r\n",
		"-E (Limits the maximum number of error codes get when downloading a page) [Default: 0 (No limit)]\r\n",
		"-e (Doesn't Add External Host)\r\n",
		"-F (Free indexing mode)\r\n",
		"-x (Saves a cache of the html page (full html)) (slow)\r\n",
		"-z (Saves a Compressed cache of the html page (full html)) (slow)\r\n",
		"-f [module] (Import loadable functions from the library)\r\n",
		"-X [eXtension(s)] (Set all the extensions that openwebspider must consider (eg. -X pdf,swf))\r\n",
		"-u (index only new pages (Update))\r\n",
		"-T (Testing Mode) No data (pages and rels) will be written into the DB\r\n",
		"-r [0-1-2](Saves relationships between pages (Default: 1))\r\n    0: doesn't save relationships\r\n    1: saves only relationships between hosts\r\n    2: saves all relationships (between hosts and pages)\r\n",
		"-n (No index pages) Don't index pages\r\n"
	};
	/* help lines printed after the crawl-delay entry */
	static const char *const helpAfterDelay[] = {
		"-S [TCP PORT] (Act as a server to get commands)\r\n",
		"-o [OpenWebSpider Own Index] (Build the OWS own index)\r\n",
		"--\r\n-p [path] (specify the full path of the configuration file (eg.: \"/etc/openwebspider/openwebspider.conf\"))\r\n",
		"\r\n(*) Arguments needed\r\n"
	};
	size_t line;

	for(line=0; line<sizeof(helpBeforeDelay)/sizeof(helpBeforeDelay[0]); line++)
		fputs(helpBeforeDelay[line],stdout);

	/* the only entry that needs formatting: the upper bound is MAXCRAWLDELAY expressed in ms */
	printf("-d [0-%i ms (Crawl Delay)] (Default: 0)\r\n",MAXCRAWLDELAY*1000);

	for(line=0; line<sizeof(helpAfterDelay)/sizeof(helpAfterDelay[0]); line++)
		fputs(helpAfterDelay[line],stdout);

	fprintf(stderr,"\r\n\r\nERROR: %s\r\n\r\n",txt);

	exit(0);
}


/*
 * Reports the signal number passed in `a`.
 * Signal 15 terminates the process immediately with exit(0); any other
 * signal only raises the global iQuit flag and returns.
 *
 * NOTE(review): 15 is presumably SIGTERM - confirm for the target platforms.
 * NOTE(review): if this is installed as a signal handler (the registration is
 * not visible in this section), printf() and exit() are not async-signal-safe.
 */
void sigdie(int a)
{
	printf("\r\n\r\nCaught signal n.%i\r\n\r\n",a);

	if(a!=15)
	{
		/* request a shutdown through the global flag */
		iQuit=1;
		return;
	}

	printf("\r\nExiting...\r\n");
	exit(0);
}

/*
 * Sanity check for the host part of the start URL.
 *
 * Returns 1 when `host`:
 *   - is neither NULL nor empty,
 *   - does not start or end with '.',
 *   - is not the bare label "www" (any letter case),
 *   - contains only alphanumerics and the characters '-', '.', ':', '[', ']',
 *   - has no two consecutive dots,
 *   - contains at least one alphanumeric character.
 * Returns 0 otherwise.
 */
static int IsValidStartHost(const char *host)
{
    const char *cursor;
    size_t hostLen;
    size_t wCount = 0;
    int sawAlnum = 0;
    char previous = '\0';

    if (host == NULL || host[0] == '\0')
        return 0;

    hostLen = strlen(host);

    /* A leading or trailing dot is never accepted; this also rejects "www." */
    if (host[0] == '.' || host[hostLen - 1] == '.')
        return 0;

    /* Reject the bare "www" label, whatever the letter case */
    if (hostLen == 3)
    {
        for (cursor = host; *cursor != '\0'; cursor++)
        {
            if (*cursor == 'w' || *cursor == 'W')
                wCount++;
        }

        if (wCount == 3)
            return 0;
    }

    for (cursor = host; *cursor != '\0'; cursor++)
    {
        unsigned char ch = (unsigned char)*cursor;

        if (isalnum(ch))
        {
            sawAlnum = 1;
        }
        else
        {
            switch (ch)
            {
                case '-':
                case ':':
                case '[':
                case ']':
                    break;

                case '.':
                    /* empty label ("..") */
                    if (previous == '.')
                        return 0;
                    break;

                default:
                    return 0;
            }
        }

        previous = *cursor;
    }

    return sawAlnum;
}

/*
 * Parses `value` as an unsigned decimal number not greater than `maxValue`.
 *
 *  value:    NUL-terminated text to parse
 *  maxValue: largest accepted value (inclusive)
 *  out:      receives the parsed value on success; left untouched on failure
 *
 * Returns 1 on success, 0 when an argument is NULL, the text is empty,
 * negative, not entirely numeric, out of range for strtoull(), or greater
 * than `maxValue`.
 */
static int ParseUnsignedLimit(const char *value, OWS_DOWNLOAD_SIZE maxValue, OWS_DOWNLOAD_SIZE *out)
{
    const char *firstNonSpace;
    char *end = NULL;
    unsigned long long parsed;

    if (!value || !out || value[0] == '\0')
        return 0;

    /* strtoull() skips leading white space and silently negates "-N" into a
       huge unsigned value, so the sign has to be looked for after the white
       space, not only in value[0]. */
    firstNonSpace = value;
    while (isspace((unsigned char)*firstNonSpace))
        firstNonSpace++;

    if (*firstNonSpace == '-')
        return 0;

    errno = 0;
    parsed = strtoull(value, &end, 10);

    /* reject overflow, "no digits at all", trailing garbage and values above the limit */
    if (errno == ERANGE || end == value || *end != '\0' || parsed > (unsigned long long)maxValue)
        return 0;

    *out = (OWS_DOWNLOAD_SIZE)parsed;
    return 1;
}

/*
 * unsigned int flavour of ParseUnsignedLimit().
 *
 *  value:    NUL-terminated text to parse
 *  maxValue: largest accepted value (inclusive)
 *  out:      receives the parsed value on success; left untouched on failure
 *
 * Returns 1 on success, 0 when ParseUnsignedLimit() rejects the text.
 */
static int ParseUnsignedIntLimit(const char *value, unsigned int maxValue, unsigned int *out)
{
    OWS_DOWNLOAD_SIZE wideValue;

    if (ParseUnsignedLimit(value, maxValue, &wideValue))
    {
        /* cannot truncate: the value was already checked against maxValue */
        *out = (unsigned int)wideValue;
        return 1;
    }

    return 0;
}

int main(int argc, char*argv[])
{
struct sHost currentHst;
char starturl[MAXURLSIZE], *starturlTmp;
int c;
extern int optind;
char sUserQuery[MAXUSERQUERYSIZE];
char sConfFilePath[MAXURLSIZE];
unsigned int bUseAggressiveDefaults = 0;

	printf("OpenWebSpider(v%s)\r\n  Developed by Stefano Alimonti And Stefano Fantin\r\n\r\n",VERSION);

	SSL_library_init();
	SSL_load_error_strings();
	OpenSSL_add_all_algorithms();
	g_ssl_ctx = SSL_CTX_new(TLS_client_method());
	if(!g_ssl_ctx)
	{
		fprintf(stderr,"Error initializing OpenSSL\n");
		return 1;
	}

	if(argc<2)
		usage("Too few arguments");

	memset(starturl,0,MAXURLSIZE);
	memset(sConfFilePath,0,MAXURLSIZE);
	memset(&CustomExtensions,0,sizeof(CustomExtensions));

    CRAWLER_LIMITS.nMaxPagesPerSite    = 0;
    CRAWLER_LIMITS.nMaxDepthLevel      = 0;
    CRAWLER_LIMITS.nMaxSecondsPerSite  = 0;
    CRAWLER_LIMITS.nMaxBytesPerSite    = 0;
	CRAWLER_LIMITS.nMaxErrorPerSite    = 0;


	
	while ((c = getopt(argc, argv, "AIisrtmTelxRfXuzdFnSpocbEO")) != -1)
	switch (c)
	{
		case 'A':
			bUseAggressiveDefaults = 1;
		break;
		case 'I':				//indexed search
			if(scan_mode!=0)
				usage("(-I): Scan Mode redefinition");

			scan_mode=2;

			if(optind>=argc)
				usage("(-I): No enough arguments");

			if(strlen(argv[optind])>MAXUSERQUERYSIZE-1)
				usage("(-I): Query too long");
			else
			{
				strncpy(sUserQuery,argv[optind],MAXUSERQUERYSIZE-1);
				optind++;
			}

		break;
		case 'O':				//build OOI (expected argc = 2 or 3)
			if(argc > 3)
				usage("(-O): Too much arguments");

			if(scan_mode!=0)
				usage("(-O): Scan Mode redefinition");
			
			if(argc == 3)
			{
				if(strlen(argv[2])>MAXHOSTSIZE-1)
					usage("(-O): Hostname too long");
				else
					strncpy(starturl,argv[2],MAXHOSTSIZE-1);
			}

			scan_mode=3;
		break;
		case 'i':                   //Index pages
			if(scan_mode!=0)        //At startup scan_mode==0xFF => uninitialized
				usage("(-i): Scan Mode redefinition");

			scan_mode=1;

			if(optind>=argc)
				usage("(-i): No enough arguments");

			if(strlen(argv[optind])>MAXURLSIZE-1)
				usage("(-i): Url too long");
			else
			{
				strncpy(starturl,argv[optind],MAXURLSIZE-1);
				optind++;
			}

		break;
		case 'f':                   //Load library
			if(optind>=argc)
				usage("(-f): No enough arguments");

			if(strlen(argv[optind])>MAXPAGESIZE-1)
				usage("(-f): File name too long");
			else
			{
				myLoadModules(argv[optind],modHandler);
				optind++;
			}

		break;
		case 's':
			starthostonly=1;
		break;
		case 'r':                   //relationships

			if(optind>=argc)
				usage("(-r): No enough arguments");

			if(strcmp(argv[optind],"0")==0 || strcmp(argv[optind],"1")==0 || strcmp(argv[optind],"2")==0)
				nRelationships=atoi(argv[optind]);
			else
				usage("(-r): Range value 0,1,2");

			optind++;

		break;
		case 't':                   //n threads

			if(optind>=argc)
				usage("(-t): No enough arguments");

			{
				unsigned int parsedThreads;
				if(!ParseUnsignedIntLimit(argv[optind], MAXTHREAD, &parsedThreads))
					usage("(-t): Wrong number of threads");
				nThread = (int)parsedThreads;
			}

			if(nThread<1)
				usage("(-t): At least one thread");

			optind++;

		break;
		case 'm':                    //maximum level of depth

			if(optind>=argc)
				usage("(-m): No enough arguments");

			if(!ParseUnsignedIntLimit(argv[optind], UINT_MAX, &CRAWLER_LIMITS.nMaxDepthLevel))
				usage("(-m): Wrong level of depth");

			optind++;

		break;
		case 'l':					 //maximum pages per site
			if(optind>=argc)
				usage("(-l): No enough arguments");

			if(!ParseUnsignedIntLimit(argv[optind], UINT_MAX, &CRAWLER_LIMITS.nMaxPagesPerSite))
				usage("(-l): Wrong value for maximum number of pages per site");

			optind++;
		break;
        case 'c':					 //maximum seconds per site
			if(optind>=argc)
				usage("(-l): No enough arguments");

			if(!ParseUnsignedIntLimit(argv[optind], UINT_MAX, &CRAWLER_LIMITS.nMaxSecondsPerSite))
				usage("(-l): Wrong value for maximum number of seconds per site");

			optind++;
		break;
        case 'b':					 //maximum bytes downloaded per site
			if(optind>=argc)
				usage("(-l): No enough arguments");

			if(!ParseUnsignedLimit(argv[optind], UINT64_MAX, &CRAWLER_LIMITS.nMaxBytesPerSite))
				usage("(-l): Wrong value for maximum number of bytes per site");

			optind++;
		break;
		case 'E':					 //maximum error per site
			if(optind>=argc)
				usage("(-E): No enough arguments");

			if(!ParseUnsignedIntLimit(argv[optind], UINT_MAX, &CRAWLER_LIMITS.nMaxErrorPerSite))
				usage("(-E): Wrong value for maximum number of error codes");

			optind++;
		break;
		case 'x':					//Save HTML Cache
		case 'z':
			xCacheHtml=1;
			if(c=='z')
				xCacheHtmlCompressed=1;
		break;
		case 'S':                   //Act as a server

			if(optind>=argc)
				usage("(-S): No enough arguments");

			if(!ParseUnsignedIntLimit(argv[optind], 65535, &actAsAServerPort) || actAsAServerPort<1)
				usage("(-S): TCP PORT must be an integer");

			optind++;

		break;
		case 'p':                   //Path of openwebspider.conf
			if(optind>=argc)
				usage("(-p): No enough arguments");

			if(strlen(argv[optind])>MAXURLSIZE-1)
				usage("(-p): Path too long");
			else
			{
				strncpy(sConfFilePath,argv[optind],MAXURLSIZE-1);
				optind++;
			}

		break;
		case 'n':                    //Do not index pages
			bDontIndexPages=1;
		break;

		case 'T':                    //Test (doesn't write data to the DB)
			bTesting=1;
		break;
		case 'e':                    //Doesn't add external hosts
			bAddExternalHost=1;
		break;
		case 'u':                    //Update: index only new pages
			bUpdate=1;
		break;
		case 'F':                    //Free indexing mode
			bFreeIndexingMode=1;
		break;
   		case 'o':                    //OWS Own Index
			bBuildOwsOwnIndex=1;
    	break;
		case 'X':                   //Custom Extensions       (Under Construction)
			if(optind>=argc)
				usage("(-X): No enough arguments");

			if(strlen(argv[optind])>MAXCUSTOMEXTENSIONSIZE-1)
				usage("(-X): Custom extensions argument too long");
			else
			{
				/*split extensions*/
			char * pExt;
			int c=0;

				pExt = strtok (argv[optind],",");

				while (pExt != NULL)
				{
					if(c>MAXCUSTOMEXTENSIONS)
						break;
					
					if(strlen(pExt)<MAXEXTENSIONSIZE)
					{
						strcpy(CustomExtensions[c++],pExt);
					}
					pExt = strtok (NULL, ",");
				}

				optind++;
			}

		break;
		case 'd':                   //Crawl Delay

			if(optind>=argc)
				usage("(-d): No enough arguments");

			{
				unsigned int parsedDelay;
				if(!ParseUnsignedIntLimit(argv[optind], MAXCRAWLDELAY*1000, &parsedDelay))
					usage("(-d): Wrong Crawl Delay");
				iCrawlDelay = (int)parsedDelay;
			}

			optind++;

		break;
	default:
			usage("Unknown option argument");
	}

	if(scan_mode==1 && bUseAggressiveDefaults)
		OwsEnableAggressiveIndexDefaults();

    /*
    CHECKs
    */
    if( (bBuildOwsOwnIndex == 1)  &&
       ((bDontIndexPages == 1)    ||
        (bTesting == 1)           ||
        (bUpdate == 1)    )        )
            usage("Wrong mix of arguments");

    /*
    Parse Config File
    */
	if(ReadConfFile(sConfFilePath)==0)
		return 0;

	/***************************************************************/

	if(scan_mode==0)
		usage("Scan mode undefined");

	/*********************************/

	if(scan_mode==2)
	{
	MYSQL mysql;

		printf("Scan Mode:       \tIndexed\r\n");
		printf("Query:           \t%s\r\n",sUserQuery);
		printf("Surfing the DB...\r\n");

		if(sqlConnect(MYSQLSERVER2, USERDB2, PASSDB2, DB2 , &mysql, MYSQLSERVER_PORT2)==0)
		{
			fprintf(stderr, "Failed to connect to database: Error: %s\n",mysql_error(&mysql));

		return 0;
		}


		return( IndexedSearch(&mysql,sUserQuery) );
	}

    /***********************/

    if(scan_mode==1)
    {
        starturlTmp=(char*)malloc(MAXURLSIZE);
        if(starturlTmp == NULL)
            usage("Memory allocation error");

        /* Se NON inizia con http:// e NON inizia con https://,
           allora pre-pendiamo http://
        */
        if (strncmp(starturl, "http://", 7) != 0 &&
    strncmp(starturl, "https://", 8) != 0)
{
    /* Se non c'è né http:// né https://, di default aggiungiamo http:// */
    snprintf(starturlTmp, MAXURLSIZE, "http://%.377s", starturl);
    strcpy(starturl, starturlTmp);
}


        /* decodifica URL in starturlTmp */
        unencode(starturl, starturl+strlen(starturl)+1, starturlTmp);
        strncpy(starturl, starturlTmp, MAXURLSIZE-1);
        starturl[MAXURLSIZE-1] = '\0';

        FREE(starturlTmp);

        if(ParseUrl(starturl,&currentHst,NULL)==-1)
            usage("Wrong start URL");

        if(!IsValidStartHost(currentHst.Host))
            usage("Wrong start URL: invalid hostname");

        strncpy(currentHst.Description,starturl,MIN(strlen(starturl),MAXDESCRIPTIONSIZE-1));
        
        /* initialize the crawler and print a banner */
        if(InitCrawler(currentHst) == -1)
        {
            printf("\nSome error occourred when trying to initialize the crawler!\n\n");
            return 0;
        }
        
        return CrawlerMainLoop(currentHst);
    }

    if(scan_mode == 3)
    {
        DWORD tStart;

        /* build OOI */

        /* connect to mysql servers */
        if(InitMysql()==-1)
            return 0;

        SetConsoleTitle("building OOI...");

        tStart = GetTickCount();

        if( starturl[0]!=0 )
        {
            starturlTmp=(char*)malloc(MAXURLSIZE);  
            if (strncmp(starturl, "http://", 7) != 0 &&
    strncmp(starturl, "https://", 8) != 0)
{
    /* Se non c'è né http:// né https://, aggiungiamo http:// di default */
    snprintf(starturlTmp, MAXURLSIZE, "http://%.377s", starturl);
    strcpy(starturl, starturlTmp);
}


            unencode(starturl,starturl+strlen(starturl)+1,starturlTmp);
            strncpy(starturl,starturlTmp,MAXURLSIZE-1);
            starturl[MAXURLSIZE-1] = '\0';

            FREE(starturlTmp);

            if(ParseUrl(starturl,&currentHst,NULL)==-1)
                usage("Wrong start URL");

            if(!IsValidStartHost(currentHst.Host))
                usage("Wrong start URL: invalid hostname");

            BuildOwsOwnIndex(&currentHst, 1);
        }
        else
            BuildOwsOwnIndex(NULL, 2);

        printf("\r\nBuilt OOI in %i ms\r\n\r\n", (int)(GetTickCount()-tStart));
    }

    return 1;
}

/*EOF*/
