
/* OpenWebSpider
 *
 *  Authors:     Stefano Alimonti AND Stefano Fantin
 *  Version:     0.8
 *  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it
 *
 *
 * This file is part of OpenWebSpider
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */


#ifndef __OPTIONS
#define __OPTIONS

#include <limits.h>
#include <stdint.h>

/*
 * Platform compatibility layer.
 *
 * The rest of this block is keyed on WIN32, so define it whenever the
 * compiler only provides _WIN32.
 */
#if !defined(WIN32) && defined(_WIN32)
#  define WIN32
#endif

#if defined(WIN32)
   /* Windows: WIN32_LEAN_AND_MEAN is defined (unless the build already did)
    * before the Winsock 2 and Windows headers are included, and the Winsock
    * import library is requested through a linker pragma. */
#  ifndef WIN32_LEAN_AND_MEAN
#    define WIN32_LEAN_AND_MEAN
#  endif
#  include <winsock2.h>
#  include <ws2tcpip.h>
#  include <windows.h>
#  pragma comment(lib,"WS2_32.lib")
#else
   /* Everywhere else: map the Windows spellings used by the code base onto
    * plain C / pthread types. */
#  include <pthread.h>
#  include <sys/types.h>
#  define SOCKET          int
#  define SOCKADDR_IN     struct sockaddr_in
#  define LPSOCKADDR      struct sockaddr*
#  define SOCKET_ERROR    (-1)
#  define INVALID_SOCKET  (-1)
   /* NOTE(review): unsigned long is 64 bits wide on LP64 platforms whereas
    * the Windows DWORD is 32 bits; confirm no caller depends on the width. */
   typedef unsigned long  DWORD;
#  define LPVOID          void*
#  define HANDLE          pthread_t
#endif

/* Unsigned integer type wide enough to hold a pointer value (uintptr_t);
 * it is the element type of the global hMutex[] table. */
typedef uintptr_t OWS_MUTEX_OWNER;
/* 64-bit unsigned type used for byte counters and byte limits in this header
 * (bytesDownloaded, nMaxBytesPerSite). */
typedef uint64_t OWS_DOWNLOAD_SIZE;

/*Host*/
/* Port number constant (80); presumably the default HTTP port - confirm at the use sites. */
#define PORT                80
/* Size of the sHost.Host buffer. */
#define MAXHOSTSIZE         255
/* Size of the sHost.Page buffer; also used for each lstRobotsExclusions row
 * and for LOADEDMODULE.filename. */
#define MAXPAGESIZE         2048
/* Host + page + 30 extra characters (the purpose of the 30-character slack is
 * not stated here). The expansion is parenthesised so that the macro behaves
 * as a single value inside larger expressions: without the parentheses
 * `MAXURLSIZE * 2` expanded to `255 + 2048 + 30 * 2`. */
#define MAXURLSIZE          (MAXHOSTSIZE + MAXPAGESIZE + 30)

/*Html*/
/* Size limits for HTML/HTTP handling. Units are not stated in this header
 * (presumably bytes/characters - confirm at the use sites). */
/* Base buffer size; MAXPACKETSIZE below and MAXQUERYSIZE are derived from it. */
#define MAXPACKETBUFSIZE    200000
/* Alias of MAXPACKETBUFSIZE. */
#define	MAXPACKETSIZE       MAXPACKETBUFSIZE
#define MAXHTTPDOWNLOADSIZE 6144000
#define MAXTAGSIZE          20
/* Size of the sHost.Description buffer. */
#define MAXDESCRIPTIONSIZE  1024
#define MAXTAGLENGTH        50000
#define	MAXHTTPSTATUSSIZE   50

/*robots.txt*/
/* Number of rows in lstRobotsExclusions[][]. */
#define MAXDISALLOW         1000
/* Upper bound related to the crawl delay; unit and enforcement are not
 * visible in this header. */
#define MAXCRAWLDELAY       999

/*RANK*/
/* Ranking-related limit; its exact meaning is not visible in this header. */
#define MAXPRLEV            10

/*SQL*/
/* Query buffer size: one packet buffer plus 50000 extra characters. */
#define MAXQUERYSIZE        (MAXPACKETBUFSIZE + 50000)
#define MAXUSERQUERYSIZE    200

/* minimum delay between 2 pings */
/* NOTE(review): the unit is not stated; 60000 presumably means milliseconds
 * (60 s), like the socket timeouts - confirm. */
#define MYSQL_MIN_PING_DELAY 60000

/*Socket*/
/* Socket timeouts, expressed in milliseconds (50 s and 10 s respectively).
 * Which socket operations each one applies to is not visible in this header. */
#define FIRSTTIMEOUT         50000      /*MSeconds*/
#define TIMEOUTs             10000      /*MSeconds*/

/*
 * Threads and mutexes
 */

/* Table sizes */
#define MAXMUTEX        10        /* entries in hMutex[] and iLastPing[] */
#define MAXTHREAD       8000      /* entries in thrdStatus[]             */

/* Lock identifiers. NO_BLOCK is the only negative one; the others are small
 * consecutive integers (presumably indices into hMutex[] - confirm). */
#define NO_BLOCK        (-1)
#define BLOCKTHRDHST    0
#define BLOCKDB1        1
#define BLOCKINDEX      2
#define BLOCKEXH        3
#define BLOCKEXCRAWL    4

/* Average thread delay: 100 seconds */
#define AVGTHREADDELAY  100000

/* External modules */
#define MAXMSGERRORSIZE     1000

/*OWS Server*/
#define OWSSERVERMAXLOGINS  10
#define MAXCOMMANDSIZE      1000
#define MAXKEYWORDSIZE      10
/* What is left of a command once the keyword and one more character are
 * removed. The expansion is parenthesised so the macro can be used inside
 * larger expressions: unparenthesised, `x - MAXARGUMENTSIZE` and
 * `MAXARGUMENTSIZE * n` silently produced the wrong value. */
#define MAXARGUMENTSIZE     (MAXCOMMANDSIZE - MAXKEYWORDSIZE - 1)
/* Sends the "command not understood" reply. The expansion relies on a SEND()
 * macro/function and on a variable named `sock` being in scope at the point
 * of use; no trailing semicolon is supplied.
 * NOTE(review): the message opens a second <div> instead of closing the
 * first one (probably meant "</div>"); left untouched because the text is
 * sent to clients verbatim. */
#define __SERVR_COMMANDERR  SEND(sock, "\r\n<div align='center'>Command not understood<div align='center'>\r\n")

/*Misc*/
/* Assorted size limits. Only the two CustomExtensions dimensions are used
 * inside this header; the others are consumed elsewhere. */
#define MAXKEYSIZE          20
#define MAXEXTERNALNODE     1000
#define MAXOUTPUTLINE       500
#define MAXREGULAREXPRESSIONSIZE 100
/* Number of rows in CustomExtensions[][]. */
#define MAXCUSTOMEXTENSIONS      512
/* Size of each CustomExtensions row. */
#define MAXCUSTOMEXTENSIONSIZE   50
#define MAXEXTENSIONSIZE         32

/*Parse Config File*/
#define MAXCONFKEYSIZE       100
/* Size of every configuration string buffer defined in this header
 * (DB1, DB2, MYSQLSERVER1/2, USERDB1/2, PASSDB1/2, OWS_SERVER_PASSWORD). */
#define MAXCONFARGSIZE       100

/*Encoding*/
/* Values stored in the `type` field of the ahList[] entity table. */
#define UTF8_ENCODING        0
#define ASCII_ENCODING       1

/*OWS index*/
/* Word-length bounds for the built-in index (presumably in characters - confirm). */
#define OWSINDEXMINWORDSIZE  1
#define OWSINDEXMAXWORDSIZE  30
#define LEXICONWORDSIZE		 2000
/* store the index to the DB every OWSINDEXMAXSWAPDELAY pages */
#define OWSINDEXMAXSWAPDELAY 60
/* Set of delimiter characters for the indexer. '-' and '!' each appear twice
 * in the literal. */
#define INDEXERTOKENS        " ,.;:-_@#!\"\'\\/<>^[]{}()\r\n\t*%$&=+-|!?"
/* Lexicon bookkeeping counters.
 * NOTE(review): these are definitions, not extern declarations; including
 * this header from more than one translation unit can produce duplicate
 * symbol errors (e.g. with -fno-common). */
unsigned int lexicon_number_of_elements;
unsigned int lexicon_actual_size;


/* Thread-count setting, initialised to 20 (presumably overridable at start-up - confirm).
 * NOTE(review): this and the objects below are defined in the header itself,
 * so the header can only be included from a single translation unit. */
int nThread = 20;

/* Connection settings for the two databases; each buffer holds at most
 * MAXCONFARGSIZE characters including the terminator. Where they are filled
 * in is not visible in this header. */
char	DB1[MAXCONFARGSIZE];
char	DB2[MAXCONFARGSIZE];
char	MYSQLSERVER1[MAXCONFARGSIZE];
char	MYSQLSERVER2[MAXCONFARGSIZE];
char	USERDB1[MAXCONFARGSIZE];
char	USERDB2[MAXCONFARGSIZE];
char	PASSDB1[MAXCONFARGSIZE];
char	PASSDB2[MAXCONFARGSIZE];

/* Port numbers matching MYSQLSERVER1 / MYSQLSERVER2. */
unsigned int MYSQLSERVER_PORT1;
unsigned int MYSQLSERVER_PORT2;

/* Password string for the OWS server mode. */
char	OWS_SERVER_PASSWORD[MAXCONFARGSIZE];

/*
 * Record describing one host/page pair together with the HTTP metadata kept
 * for it. The string members are fixed-size buffers; the meaning of the
 * numeric members other than isSSL is defined by the code that fills them in.
 */
struct sHost
{
    /* location */
    char            Host[MAXHOSTSIZE];
    char            Page[MAXPAGESIZE];
    char            Description[MAXDESCRIPTIONSIZE];
    unsigned short  port;

    /* crawl bookkeeping */
    unsigned short  type;
    unsigned short  viewed;
    unsigned short  level;
    unsigned short  redirectDepth;
    unsigned short  retryCount;
    unsigned int    host_id;
    unsigned short  isSSL;          /* 0 = HTTP, 1 = HTTPS */

    /* HTTP header values (named after the corresponding response headers) */
    char            HttpETag[512];
    char            HttpLastModified[128];
    char            HttpContentType[256];
    char            HttpContentDisposition[1024];
};

typedef struct sHost SHOST;


/*Current Host*/
/* Single global host record for the host currently being processed. */
struct sHost IndexingHost;

/*Global MySQL*/
/* Two global MySQL connection objects; the MYSQL type must be declared by a
 * header included before this one (it is not included here). */
MYSQL gMysqlDB1;
MYSQL gMysqlDB2;
/* Table-name buffer, 20 characters including the terminator. */
char  gTable[20];

/*global Mutex*/
/* One volatile owner slot per lock, MAXMUTEX slots in total.
 * NOTE(review): volatile alone does not provide atomicity or ordering between
 * threads; confirm how the slots are acquired and released. */
volatile OWS_MUTEX_OWNER hMutex[MAXMUTEX];

/*global Status of Threads*/
/* One status word per thread slot, MAXTHREAD slots in total. */
DWORD thrdStatus[MAXTHREAD];
/*
thrdStatus[]==0	-> Thread is alive
thrdStatus[]==1	-> Thread is dead
....
*/



/*
 * An accepted connection: the socket together with the peer address.
 */
struct sHandleConnection
{
    SOCKET      sock;       /* connection socket */
    SOCKADDR_IN client;     /* address of the remote end */
};

typedef struct sHandleConnection SHC;

/* Run statistics. The two counters start at zero; startTime and startTimeMS
 * are not initialised here and are presumably set when a run begins - confirm. */
OWS_DOWNLOAD_SIZE bytesDownloaded      = 0;
unsigned int nErrorPages                = 0;
/* Start time as text; at most 9 characters plus the terminator. */
char startTime[10];
/* Start time as a number; the MS suffix suggests milliseconds - confirm. */
DWORD startTimeMS;

/*
 * SWITCHes
 *
 * Global option flags and counters. Everything starts at 0 except
 * nRelationships, which starts at 1. Where they are changed is outside this
 * header.
 */
unsigned xCacheHtml           = 0;
unsigned xCacheHtmlCompressed = 0;
unsigned nPagesViewed         = 0;
unsigned nRelationships       = 1;
unsigned bDontIndexPages      = 0;
unsigned bTesting             = 0;
unsigned starthostonly        = 0;
unsigned bFreeIndexingMode    = 0;
unsigned bAggressiveIndexMode = 0;
unsigned bUpdate              = 0;
unsigned actAsAServerPort     = 0;
unsigned bBuildOwsOwnIndex    = 0;

/* Per-site crawl limits. Note that CRAWLER_LIMITS is a global OBJECT of type
 * struct __crawler_limits, not a type name. As a file-scope object without an
 * initialiser all members start at zero; whether zero means "no limit" is
 * decided by the code that reads them (not visible here). */
struct __crawler_limits
{
    unsigned int nMaxPagesPerSite;
    unsigned int nMaxDepthLevel;
    unsigned int nMaxSecondsPerSite;
    OWS_DOWNLOAD_SIZE nMaxBytesPerSite;
	unsigned int nMaxErrorPerSite;
} CRAWLER_LIMITS;

/* Second set of per-site limits with the same members as the crawler limits
 * minus the error cap. EXTRA_LIMITS is a global OBJECT, not a type name; all
 * members start at zero. When this set applies instead of the crawler limits
 * is not visible in this header. */
struct __extra_limits
{
    unsigned int nMaxPagesPerSite;
    unsigned int nMaxDepthLevel;
    unsigned int nMaxSecondsPerSite;
    OWS_DOWNLOAD_SIZE nMaxBytesPerSite;
} EXTRA_LIMITS;

/* Operating mode selector; starts at 0 (see the value list below). */
unsigned int scan_mode=0;
/* scan_mode==0 => Real time search (deprecated)
 * scan_mode==1 => Index
 * scan_mode==2 => Indexed search
 * scan_mode==0xFF => uninitialized
 */

/*
 * SIGNALs
 *
 * Global control flags, all cleared at start-up. Who sets and who polls them
 * is outside this header.
 */
unsigned iQuit                  = 0;
unsigned bKillThread            = 0;
unsigned bKillThreadReserved    = 0;
unsigned iStop                  = 0;
unsigned iDoNextHost            = 0;

/* Host record pointer, initially NULL. */
struct sHost *nextHost = NULL;

unsigned bSwapping              = 0;
unsigned bAddExternalHost       = 0;
unsigned bUseRegularExpressionA = 0;
unsigned bUseRegularExpressionB = 0;

/*STRUCTUREs*/
/*
 * taglist[]: table of (tag, attribute) pairs.
 *
 *   bTag  tag name
 *   eTag  attribute name looked up on that tag
 *   flag  0 : <tag1 attr=123> xyz </tag1>   e.g. <a href="/index.php">Home</a>
 *         1 : <tag2 attr2="test">           e.g. <base href="http://www.openwebspider.org/">
 *        -1 : end-of-table marker (last row, both strings empty)
 *
 * NOTE(review): the examples in the legend and the data disagree - the legend
 * shows <a> under flag 0 and <base> under flag 1, while the table gives
 * "base" flag 0 and the first three "a" rows flag 1. Confirm against the
 * parser which of the two is intended.
 */
struct
{
    char* bTag;
    char* eTag;
    int   flag;
} taglist[] =
{
    /* bTag               eTag                      flag */
    { "base",             "href",                    0 },
    { "a",                "href",                    1 },
    { "a",                "data-href",               1 },
    { "a",                "data-url",                1 },
    { "ref",              "href",                    0 },
    { "area",             "href",                    0 },
    { "enclosure",        "url",                     0 },
    { "media:content",    "url",                     0 },
    { "media:thumbnail",  "url",                     0 },
    { "media:player",     "url",                     0 },
    { "form",             "action",                  0 },
    { "form",             "data-action",             0 },
    { "meta",             "content",                 0 },
    { "script",           "src",                     0 },
    { "script",           "data-src",                0 },
    { "frame",            "src",                     0 },
    { "iframe",           "src",                     0 },
    { "iframe",           "data-src",                0 },
    { "img",              "src",                     0 },
    { "img",              "data-src",                0 },
    { "img",              "data-lazy-src",           0 },
    { "img",              "data-original",           0 },
    { "img",              "data-full",               0 },
    { "img",              "data-large",              0 },
    { "img",              "data-thumb",              0 },
    { "img",              "srcset",                  0 },
    { "img",              "data-srcset",             0 },
    { "picture",          "srcset",                  0 },
    { "picture",          "data-srcset",             0 },
    { "video",            "src",                     0 },
    { "video",            "data-src",                0 },
    { "video",            "poster",                  0 },
    { "video",            "data-poster",             0 },
    { "video",            "data-video",              0 },
    { "audio",            "src",                     0 },
    { "audio",            "data-src",                0 },
    { "a",                "download",                0 },
    { "source",           "src",                     0 },
    { "source",           "data-src",                0 },
    { "source",           "srcset",                  0 },
    { "source",           "data-srcset",             0 },
    { "track",            "src",                     0 },
    { "embed",            "src",                     0 },
    { "object",           "data",                    0 },
    { "link",             "href",                    0 },
    { "link",             "imagesrcset",             0 },
    { "input",            "src",                     0 },
    { "button",           "formaction",              0 },
    { "div",              "data-bg",                 0 },
    { "div",              "data-background",         0 },
    { "div",              "data-background-image",   0 },
    { "div",              "data-src",                0 },
    { "div",              "data-url",                0 },
    { "span",             "data-src",                0 },
    { "span",             "data-url",                0 },
    { "li",               "data-src",                0 },
    { "li",               "data-url",                0 },
    { "section",          "data-bg",                 0 },
    { "section",          "data-background",         0 },

    /* end of table */
    { "",                 "",                       -1 }
};

/*
 * File extensions listed as plain text. The list is closed by an empty
 * string (written "\0"), not by a NULL pointer.
 * Note: ".xml" is also present in HtmlExtensions[].
 */
const char *PlainTextExtension[] =
{
    ".txt",
    ".c",
    ".cpp",
    ".bas",
    ".pas",
    ".h",
    ".xml",
    "\0"        /* end of list */
};


/*
 * File extensions listed as HTML-like content. The list is closed by an
 * empty string (written "\0"), not by a NULL pointer.
 * Note: ".xml" is also present in PlainTextExtension[].
 */
const char *HtmlExtensions[] =
{
    ".htm",   ".html",   ".php",         ".asp",
    ".cgi",   ".mspx",   ".aspx",        ".shtml",
    ".pl",    ".phtml",  ".cfm",         ".ch2",
    ".jsp",   ".msnw",   ".php3",        ".xml",
    ".rss",   ".atom",   ".json",        ".jsonld",
    ".webmanifest",      ".js",          ".mjs",
    ".css",
    "\0"        /* end of list */
};

/* User-supplied extensions: up to MAXCUSTOMEXTENSIONS entries of at most
 * MAXCUSTOMEXTENSIONSIZE characters each (terminator included). How the end
 * of the used part is marked is not visible in this header. */
char CustomExtensions[MAXCUSTOMEXTENSIONS][MAXCUSTOMEXTENSIONSIZE];

/* Compiled regular expressions for page and content filtering; only present
 * when the build defines USE_REGEX. regex_t must come from a header included
 * before this one. */
#ifdef USE_REGEX
regex_t regexPageFilter;
regex_t regexContentFilter;
#endif

/* One "last ping" value per MAXMUTEX slot; presumably a timestamp compared
 * against MYSQL_MIN_PING_DELAY - confirm. */
int iLastPing[MAXMUTEX];

/* robots.txt exclusion list: MAXDISALLOW rows of MAXPAGESIZE characters each
 * (1000 x 2048 bytes of static storage). */
char lstRobotsExclusions[MAXDISALLOW][MAXPAGESIZE];
int iRobCrawlDelay=0;	/* crawl delay obtained from robots.txt */
int iCrawlDelay=0;		/* crawl delay obtained from the program arguments */
int iServerBackoffCrawlDelay=0;	/* temporary crawl delay after a 429/503/504 response */
/* End of the temporary back-off period; the MS suffix suggests a millisecond
 * timestamp - confirm. */
DWORD gServerBackoffUntilMS=0;

/* robots.txt status flag; not initialised here and its exact meaning is
 * defined by the code that sets it. */
int bRobotsOK;

/*
 * Crawl profiling counters, all starting at zero.
 *
 * The code that increments and reports them is not part of this header, so
 * the only description given here is what the name prefixes suggest:
 * discovered/accepted totals, candidate rejections, queue rejections, queued
 * items by type, link sources, HTTP outcomes and MIME classes.
 * NOTE(review): plain (non-atomic) globals; confirm how updates from multiple
 * threads are serialised.
 */
unsigned long long gProfileDiscoveredTotal = 0;
unsigned long long gProfileAcceptedTotal = 0;
unsigned long long gProfileAssetCandidates = 0;
unsigned long long gProfileDynamicCandidates = 0;
unsigned long long gProfileCrossHostDiscovered = 0;
unsigned long long gProfileAssetDepthBypass = 0;
unsigned long long gProfileExternalAssetsQueued = 0;
unsigned long long gProfileRawAssetCandidates = 0;
unsigned long long gProfileCandidateRejectedEmpty = 0;
unsigned long long gProfileCandidateRejectedNotAsset = 0;
unsigned long long gProfileCandidateRejectedParse = 0;
unsigned long long gProfileCandidateRejectedToken = 0;
unsigned long long gProfileCandidateRejectedType = 0;
unsigned long long gProfileRejectedRobots = 0;
unsigned long long gProfileRejectedLimits = 0;
unsigned long long gProfileRejectedDepth = 0;
unsigned long long gProfileRejectedSwitch = 0;
unsigned long long gProfileRejectedDuplicate = 0;
unsigned long long gProfileQueuedTypeHtml = 0;
unsigned long long gProfileQueuedTypePlain = 0;
unsigned long long gProfileQueuedTypeAsset = 0;
unsigned long long gProfileQueuedTypeOther = 0;
unsigned long long gProfileSourceTag = 0;
unsigned long long gProfileSourceSrcset = 0;
unsigned long long gProfileSourceLoose = 0;
unsigned long long gProfileSourceBare = 0;
unsigned long long gProfileSourceHeader = 0;
unsigned long long gProfileSourceSitemap = 0;
unsigned long long gProfileSourceSeed = 0;
unsigned long long gProfileHttpOk = 0;
unsigned long long gProfileHttpRedirect = 0;
unsigned long long gProfileHttpError = 0;
unsigned long long gProfileMimeHtml = 0;
unsigned long long gProfileMimeXml = 0;
unsigned long long gProfileMimeJson = 0;
unsigned long long gProfileMimeCssJs = 0;
unsigned long long gProfileMimePdf = 0;
unsigned long long gProfileMimeImage = 0;
unsigned long long gProfileMimeVideo = 0;
unsigned long long gProfileMimeAudio = 0;
unsigned long long gProfileMimeOther = 0;
unsigned long long gProfileQualityWarnings = 0;


/*module handler*/
/* Opaque module handle; its relationship to loadedModules[] below is not
 * visible in this header. */
void* modHandler;

/* Capacity of the loadedModules[] registry. */
#define MAXLOADEDMODULES 16

/* One registry entry per loaded external module. All three pointers are
 * opaque here; the field names suggest a dynamic-library handle plus two
 * entry points (main handler and init handler) - confirm in the loader. */
typedef struct sLoadedModule
{
    char filename[MAXPAGESIZE];
    void* dlHandle;
    void* handler;
    void* initHandler;
} LOADEDMODULE;

/* Module registry and the number of entries recorded in it (starts at 0). */
LOADEDMODULE loadedModules[MAXLOADEDMODULES];
unsigned int loadedModuleCount = 0;

/*
 * External-module API (prototypes only; the implementations live elsewhere).
 * Return-value conventions and ownership of the returned pointers are not
 * visible here - consult the implementations before relying on them.
 * The "...At" variants take an additional zero- or one-based index (TODO
 * confirm which) and the "...Count..." variants return an unsigned count.
 */
int myLoadModules(char* filename,void* handler);
int myUnloadModule(void* handler);
void* myGetProcAddress(void* handler,char* funct);
void* GetModFunctionHandlerByName(char* functName);
void* GetInitModFunctionHandlerByName(char* functName);
unsigned int GetModFunctionHandlerCountByName(char* functName);
void* GetModFunctionHandlerByNameAt(char* functName,unsigned int index);
char* GetLoadedModuleFilenameByIndex(unsigned int index);
unsigned int GetInitModFunctionHandlerCountByName(char* functName);
void* GetInitModFunctionHandlerByNameAt(char* functName,unsigned int index);


/* http://www1.tip.nl/~t876506/utf8tbl.html */
/*
 * ahList[]: HTML named character entities and their replacements.
 *
 *   htmlChar  entity name without the leading '&' and trailing ';'
 *   rep       replacement text:
 *               - ASCII_ENCODING rows: the literal character(s)
 *               - UTF8_ENCODING rows:  the UTF-8 byte sequence written as a
 *                 hexadecimal string ("0x" + two hex digits per byte); the
 *                 decoding of that string is done by the consumer, which is
 *                 not part of this header
 *   type      UTF8_ENCODING (0) or ASCII_ENCODING (1)
 *
 * The table ends with a row whose htmlChar is NULL.
 */
struct
{
    char* htmlChar;
	char* rep;
	int type;
	/*
	type: 0 UTF8
	      1 ASCII
	*/
} ahList[] = 
   {
      { "nbsp",  " "        , ASCII_ENCODING },
      { "amp",   "&"        , ASCII_ENCODING },
      { "euro",  "0xE282AC" , UTF8_ENCODING },
      { "cent",  "0xC2A2"   , UTF8_ENCODING },
      { "copy",  "0xC2A9"   , UTF8_ENCODING },
      { "trade", "0xE284A2" , UTF8_ENCODING },
      

/* if you have problems with these lines please contact me */
      { "Aacute","0xC381"   , UTF8_ENCODING },
      { "aacute","0xC3A1"   , UTF8_ENCODING },
      { "Eacute","0xC389"   , UTF8_ENCODING },
      { "eacute","0xC3A9"   , UTF8_ENCODING },
      { "Iacute","0xC38D"   , UTF8_ENCODING },
      { "iacute","0xC3AD"   , UTF8_ENCODING },
      { "Oacute","0xC393"   , UTF8_ENCODING },
      { "oacute","0xC3B3"   , UTF8_ENCODING },
      { "Uacute","0xC39A"   , UTF8_ENCODING },
      { "uacute","0xC3BA"   , UTF8_ENCODING },
      { "Agrave","0xC380"   , UTF8_ENCODING },
      { "agrave","0xC3A0"   , UTF8_ENCODING },
      { "Egrave","0xC388"   , UTF8_ENCODING },
      { "egrave","0xC3A8"   , UTF8_ENCODING },
      { "Igrave","0xC38C"   , UTF8_ENCODING },
      { "igrave","0xC3AC"   , UTF8_ENCODING },
      { "Ograve","0xC392"   , UTF8_ENCODING },
      { "ograve","0xC3B2"   , UTF8_ENCODING },
      { "Ugrave","0xC399"   , UTF8_ENCODING },
      { "ugrave","0xC3B9"   , UTF8_ENCODING },
      { "Acirc", "0xC382"   , UTF8_ENCODING },
      { "acirc", "0xC3A2"   , UTF8_ENCODING },
      { "Ecirc", "0xC38A"   , UTF8_ENCODING },
      { "ecirc", "0xC3AA"   , UTF8_ENCODING },
      { "Icirc", "0xC38E"   , UTF8_ENCODING },
      { "icirc", "0xC3AE"   , UTF8_ENCODING },
      { "Ocirc", "0xC394"   , UTF8_ENCODING },
      { "ocirc", "0xC3B4"   , UTF8_ENCODING },
      { "Ucirc", "0xC39B"   , UTF8_ENCODING },
      { "ucirc", "0xC3BB"   , UTF8_ENCODING },
      { "Auml",  "0xC384"   , UTF8_ENCODING },
      /* a-umlaut is U+00E4, i.e. UTF-8 C3 A4. The table used to carry
         0xC5A0, which is U+0160 (S with caron). */
      { "auml",  "0xC3A4"   , UTF8_ENCODING },
      { "Euml",  "0xC38B"   , UTF8_ENCODING },
      { "euml",  "0xC3AB"   , UTF8_ENCODING },
      { "Iuml",  "0xC38F"   , UTF8_ENCODING },
      { "iuml",  "0xC3AF"   , UTF8_ENCODING },
      { "Ouml",  "0xC396"   , UTF8_ENCODING },
      { "ouml",  "0xC3B6"   , UTF8_ENCODING },
      { "Uuml",  "0xC39C"   , UTF8_ENCODING },
      { "uuml",  "0xC3BC"   , UTF8_ENCODING },
      { "Aring", "0xC385"   , UTF8_ENCODING },
      { "aring", "0xC3A5"   , UTF8_ENCODING },
      { "AElig", "0xC386"   , UTF8_ENCODING },
      { "aelig", "0xC3A6"   , UTF8_ENCODING },
      { "Ccedil", "0xC387"  , UTF8_ENCODING },
      { "ccedil", "0xC3A7"  , UTF8_ENCODING },

/* end of table: htmlChar == NULL (rep NULL, type 0 - same values as before,
   now spelled out) */
      { NULL,     NULL      , 0 }
   };

/*
 * Argument bundle carrying one page and its context: the host record, the
 * raw HTML and extracted text with their lengths, two running totals, and
 * two untyped database handles. Presumably handed to external module
 * functions - confirm at the call sites. Buffer ownership is decided by the
 * caller and is not visible here.
 */
struct functArg
{
    struct sHost      *hostInfo;

    char              *html;
    unsigned int       htmlLength;
    char              *text;
    unsigned int       textLength;

    int                PagesViewed;
    OWS_DOWNLOAD_SIZE  bytesDownloaded;

    void              *mysqlDB1;
    void              *mysqlDB2;
};

typedef struct functArg FUNCTION_ARGUMENT;



#endif


/*EOF*/
