/* OpenWebSpider
 *
 *  Authors:     Stefano Alimonti AND Stefano Fantin
 *  Version:     0.8
 *  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it
 *
 *
 * This file is part of OpenWebSpider
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#ifndef __THREAD
#define __THREAD

extern SSL_CTX *g_ssl_ctx;

#include <stdint.h>

#ifndef OWS_MAX_REDIRECT_CHAIN
#define OWS_MAX_REDIRECT_CHAIN 8
#endif

/*
 * Case-insensitive suffix test.
 *
 * Returns 1 when `s` ends with `suffix` (both sides folded with tolower),
 * 0 otherwise or when either pointer is NULL. An empty suffix matches any
 * non-NULL string.
 */
static int OwsEndsWithCi(const char *s, const char *suffix)
{
    const char *tail;
    size_t strLen, sufLen;

    if (s == NULL || suffix == NULL)
        return 0;

    strLen = strlen(s);
    sufLen = strlen(suffix);
    if (sufLen > strLen)
        return 0;

    /* Compare the last sufLen characters of s in lockstep with suffix. */
    tail = s + (strLen - sufLen);
    while (*suffix != '\0')
    {
        if (tolower((unsigned char)*tail) != tolower((unsigned char)*suffix))
            return 0;
        tail++;
        suffix++;
    }
    return 1;
}

/*
 * Reports whether the path part of `page` (everything before the first
 * '?' or '#') ends, case-insensitively, with one of the media file
 * extensions listed below.
 *
 * Returns 1 on a match, 0 otherwise (including NULL or empty input).
 */
static int OwsHasMediaSuffix(const char *page)
{
    /* Extension table; the empty string at the end terminates the scan. */
    static const char *mediaExts[] = {
        ".mp4",".webm",".ogv",".mov",".avi",".mkv",".m4v",".flv",".wmv",".asf",
        ".ts",".m2ts",".mpeg",".mpg",".mpe",".vob",".3gp",".3g2",
        ".mp3",".m4a",".m4b",".aac",".adts",".oga",".ogg",".wav",".flac",".wma",".opus",".weba",
        ".aif",".aiff",".mid",".midi",
        ".m3u8",".mpd",".vtt",".srt","\0"
    };
    char pathOnly[MAXPAGESIZE];
    const char **ext;
    size_t pathLen;

    if (page == NULL || page[0] == '\0')
        return 0;

    /* Length of the leading run that contains neither '?' nor '#'. */
    pathLen = strcspn(page, "?#");
    if (pathLen >= sizeof(pathOnly))
        pathLen = sizeof(pathOnly) - 1;   /* over-long paths are cut to the buffer */
    memcpy(pathOnly, page, pathLen);
    pathOnly[pathLen] = '\0';

    for (ext = mediaExts; (*ext)[0] != '\0'; ext++)
    {
        if (OwsEndsWithCi(pathOnly, *ext))
            return 1;
    }

    return 0;
}

/*
 * Reports whether the path part of `page` (everything before the first
 * '?' or '#') ends with ".pdf", ignoring case.
 *
 * Returns 1 on a match, 0 otherwise (including NULL or empty input).
 */
static int OwsHasPdfSuffix(const char *page)
{
    char pathOnly[MAXPAGESIZE];
    size_t pathLen;

    if (page == NULL || page[0] == '\0')
        return 0;

    /* Length of the leading run that contains neither '?' nor '#'. */
    pathLen = strcspn(page, "?#");
    if (pathLen >= sizeof(pathOnly))
        pathLen = sizeof(pathOnly) - 1;   /* over-long paths are cut to the buffer */
    memcpy(pathOnly, page, pathLen);
    pathOnly[pathLen] = '\0';

    return OwsEndsWithCi(pathOnly, ".pdf");
}

/*
 * Reports whether the path part of `page` (everything before the first
 * '?' or '#') ends, case-insensitively, with one of the entries of the
 * global CustomExtensions table. The table is scanned until an entry
 * whose first character is '\0'.
 *
 * Returns 1 on a match, 0 otherwise (including NULL or empty input).
 */
static int OwsHasCustomAssetSuffix(const char *page)
{
    char pathOnly[MAXPAGESIZE];
    size_t pathLen;
    int idx;

    if (page == NULL || page[0] == '\0')
        return 0;

    /* Length of the leading run that contains neither '?' nor '#'. */
    pathLen = strcspn(page, "?#");
    if (pathLen >= sizeof(pathOnly))
        pathLen = sizeof(pathOnly) - 1;   /* over-long paths are cut to the buffer */
    memcpy(pathOnly, page, pathLen);
    pathOnly[pathLen] = '\0';

    idx = 0;
    while (CustomExtensions[idx][0] != '\0')
    {
        if (OwsEndsWithCi(pathOnly, CustomExtensions[idx]))
            return 1;
        idx++;
    }

    return 0;
}

/*
 * Scans a response body for the PDF header marker "%PDF-".
 *
 * body    - raw body bytes (need not be NUL-terminated); may be NULL.
 * bodyLen - number of valid bytes in body.
 *
 * Returns 1 when the marker occurs anywhere in the first bodyLen bytes,
 * 0 otherwise. Bodies shorter than 16 bytes are never reported as PDF.
 */
static int OwsBodyContainsPdf(const char *body, unsigned int bodyLen)
{
    static const char pdfMagic[] = "%PDF-";
    const unsigned int magicLen = (unsigned int)(sizeof(pdfMagic) - 1);
    unsigned int i;

    if (!body || bodyLen < 16)
        return 0;

    /*
     * Every start offset up to and including bodyLen - magicLen is valid.
     * The previous bound (i + 5 < bodyLen) skipped the last one, so a
     * marker ending exactly at the end of the buffer was missed.
     * bodyLen >= 16 here, so the subtraction cannot wrap.
     */
    for (i = 0; i <= bodyLen - magicLen; i++)
    {
        if (memcmp(body + i, pdfMagic, magicLen) == 0)
            return 1;
    }
    return 0;
}

/*
 * Case-insensitive substring search over a length-delimited buffer.
 *
 * body    - bytes to search (need not be NUL-terminated); may be NULL.
 * bodyLen - number of valid bytes in body.
 * needle  - NUL-terminated pattern; may be NULL.
 *
 * Returns 1 when needle occurs in body (compared through tolower),
 * 0 otherwise. A NULL argument, an empty needle, or a needle longer than
 * the buffer all yield 0.
 */
static int OwsMemContainsCi(const char *body, unsigned int bodyLen, const char *needle)
{
    unsigned int needleLen, start, lastStart;

    if (body == NULL || needle == NULL)
        return 0;

    needleLen = (unsigned int)strlen(needle);
    if (needleLen == 0 || needleLen > bodyLen)
        return 0;

    /* Highest offset at which the whole needle still fits. */
    lastStart = bodyLen - needleLen;
    for (start = 0; start <= lastStart; start++)
    {
        unsigned int matched = 0;

        while (matched < needleLen &&
               tolower((unsigned char)body[start + matched]) == tolower((unsigned char)needle[matched]))
            matched++;

        if (matched == needleLen)
            return 1;
    }
    return 0;
}

/*
 * Case-insensitive substring test for NUL-terminated strings.
 * Measures `hay` and delegates to OwsMemContainsCi; a NULL `hay` yields 0.
 */
static int OwsStrContainsCi(const char *hay, const char *needle)
{
    unsigned int hayLen;

    if (hay == NULL)
        return 0;

    hayLen = (unsigned int)strlen(hay);
    return OwsMemContainsCi(hay, hayLen, needle);
}

/*
 * Heuristic: does `location` look like a bare "key=value[&key=value...]"
 * fragment rather than a URL or path?
 *
 * Returns 1 only when the string
 *   - is non-empty,
 *   - contains no "://" and does not start with "//",
 *   - does not start with '/', '?', '&' or '#',
 *   - consists solely of alphanumerics, '_', '-', '.', '&' and '=',
 *   - and contains at least one '='.
 * Returns 0 in every other case (including NULL).
 */
static int OwsLooksLikeBareRedirectParam(const char *location)
{
    size_t idx;
    int sawEquals = 0;

    if (location == NULL || location[0] == '\0')
        return 0;

    /* Scheme-qualified and protocol-relative targets are not bare params. */
    if (strncmp(location, "//", 2) == 0 || strstr(location, "://") != NULL)
        return 0;

    switch (location[0])
    {
        case '/':
        case '?':
        case '&':
        case '#':
            return 0;
        default:
            break;
    }

    for (idx = 0; location[idx] != '\0'; idx++)
    {
        unsigned char ch = (unsigned char)location[idx];

        if (ch == '=')
            sawEquals = 1;
        else if (ch != '&' && ch != '_' && ch != '-' && ch != '.' && !isalnum(ch))
            return 0;   /* any other character disqualifies the string */
    }

    return sawEquals;
}

/*
 * Produces the redirect target to follow for a raw Location value.
 *
 * currentHost - host/page the redirect came from; may be NULL.
 * rawLocation - Location value as received.
 * out/outSize - destination buffer (at least 2 bytes).
 *
 * Behaviour:
 *   - Invalid arguments or an empty rawLocation: returns 0.
 *   - rawLocation is not a bare "key=value" fragment, or there is no
 *     current page to attach it to: rawLocation is copied verbatim
 *     (truncated to the buffer) and 1 is returned.
 *   - rawLocation already occurs inside the current page: returns 0 with
 *     `out` left empty.
 *   - Otherwise the fragment is appended to the current page as a query
 *     parameter ('&' when the page already has a '?', else '?') and 1 is
 *     returned.
 */
static int OwsNormalizeRedirectLocation(const struct sHost *currentHost, const char *rawLocation, char *out, size_t outSize)
{
    int pageHasQuery;

    if (out == NULL || outSize < 2)
        return 0;
    if (rawLocation == NULL || rawLocation[0] == '\0')
        return 0;

    out[0] = '\0';

    /* Pass-through: a normal URL/path, or nothing to merge the fragment into. */
    if (!OwsLooksLikeBareRedirectParam(rawLocation) ||
        currentHost == NULL ||
        currentHost->Page[0] == '\0')
    {
        strncpy(out, rawLocation, outSize - 1);
        out[outSize - 1] = '\0';
        return 1;
    }

    /* The current page already carries this fragment: report "no target". */
    if (strstr(currentHost->Page, rawLocation) != NULL)
        return 0;

    pageHasQuery = (strchr(currentHost->Page, '?') != NULL);
    if (pageHasQuery)
        snprintf(out, outSize, "%s&%s", currentHost->Page, rawLocation);
    else
        snprintf(out, outSize, "%s?%s", currentHost->Page, rawLocation);

    out[outSize - 1] = '\0';
    return 1;
}

/*
 * Returns 1 when both host records refer to the same resource: equal Host
 * (case-insensitive), equal port, equal isSSL flag and equal Page
 * (case-sensitive). Returns 0 otherwise or when either pointer is NULL.
 */
static int OwsSameHostPage(const struct sHost *a, const struct sHost *b)
{
    if (a == NULL || b == NULL)
        return 0;

    if (stricmp(a->Host, b->Host) != 0)
        return 0;
    if (a->port != b->port)
        return 0;
    if (a->isSSL != b->isSSL)
        return 0;

    return strcmp(a->Page, b->Page) == 0;
}

/*
 * Decides whether an HTTP response should be treated as a PDF.
 *
 * host    - parsed host record (HttpContentType, HttpContentDisposition and
 *           Page are consulted); may be NULL.
 * packet  - raw response text searched for header names; may be NULL.
 * body    - response body bytes; may be NULL.
 * bodyLen - number of valid bytes in body.
 *
 * Returns 1 when any of the checks below matches, 0 otherwise. The checks
 * run in order and the first conclusive one decides.
 */
static int OwsIsPdfResponse(const struct sHost *host, const char *packet, const char *body, unsigned int bodyLen)
{
    char *ct;

    if (packet)
    {
        /* "application/pdf" anywhere at or after the Content-Type header name. */
        ct = my_stristr((char*)packet, "Content-Type:");
        if (ct && my_stristr(ct, "application/pdf"))
            return 1;
        /*
         * A Content-Disposition header plus ".pdf" somewhere in the packet,
         * confirmed by a "%PDF-" marker in the body.
         * NOTE(review): ".pdf" is searched in the whole packet, not only in
         * the Content-Disposition value.
         */
        if (my_stristr((char*)packet, "Content-Disposition:") &&
            my_stristr((char*)packet, ".pdf") &&
            OwsBodyContainsPdf(body, bodyLen))
            return 1;
    }
    /* Same checks against the already-parsed header fields on the host record. */
    if (host && OwsStrContainsCi(host->HttpContentType, "application/pdf"))
        return 1;
    /* A ".pdf" disposition is decisive: the body scan result is returned as-is,
       so the checks further down are not reached in this case. */
    if (host && OwsStrContainsCi(host->HttpContentDisposition, ".pdf"))
        return OwsBodyContainsPdf(body, bodyLen);

    /* Body starts with the PDF marker. */
    if (body && bodyLen >= 5 && memcmp(body, "%PDF-", 5) == 0)
        return 1;
    /* URL path ends in ".pdf" and the marker appears somewhere in the body. */
    if (host && OwsHasPdfSuffix(host->Page) && OwsBodyContainsPdf(body, bodyLen))
        return 1;

    return 0;
}

/*
 * Decides whether an HTTP response should be treated as an image.
 *
 * host    - parsed host record (HttpContentType is consulted); may be NULL.
 * packet  - raw response text searched for "Content-Type:"; may be NULL.
 * body    - response body bytes; may be NULL.
 * bodyLen - number of valid bytes in body.
 *
 * Returns 1 when the content type mentions "image/" or the body starts with
 * one of the byte signatures checked below, 0 otherwise.
 */
static int OwsIsImageResponse(const struct sHost *host, const char *packet, const char *body, unsigned int bodyLen)
{
    char *ct;

    if (host && OwsStrContainsCi(host->HttpContentType, "image/"))
        return 1;

    if (packet)
    {
        /* "image/" anywhere at or after the Content-Type header name. */
        ct = my_stristr((char*)packet, "Content-Type:");
        if (ct && my_stristr(ct, "image/"))
            return 1;
    }

    /* Signature sniffing needs at least four body bytes. */
    if (!body || bodyLen < 4)
        return 0;

    /* PNG signature. */
    if (bodyLen >= 8 && memcmp(body, "\x89PNG\r\n\x1a\n", 8) == 0)
        return 1;
    /* Presumably a JPEG 2000 codestream start - TODO confirm. */
    if (bodyLen >= 12 && memcmp(body, "\xff\x4f\xff\x51", 4) == 0)
        return 1;
    /* 12-byte signature box containing "jP  " (presumably JP2 - TODO confirm). */
    if (bodyLen >= 12 && memcmp(body, "\x00\x00\x00\x0cjP  \r\n\x87\n", 12) == 0)
        return 1;
    /* Two-byte 0xFF 0x0A prefix (presumably a JPEG XL codestream - TODO confirm). */
    if (bodyLen >= 2 && (unsigned char)body[0] == 0xff && (unsigned char)body[1] == 0x0a)
        return 1;
    /* 0xFF 0xD8 0xFF prefix (presumably JPEG - TODO confirm). */
    if (bodyLen >= 3 && (unsigned char)body[0] == 0xFF && (unsigned char)body[1] == 0xD8 && (unsigned char)body[2] == 0xFF)
        return 1;
    /* GIF, either version string. */
    if (bodyLen >= 6 && (memcmp(body, "GIF87a", 6) == 0 || memcmp(body, "GIF89a", 6) == 0))
        return 1;
    /* RIFF container whose form type is WEBP. */
    if (bodyLen >= 12 && memcmp(body, "RIFF", 4) == 0 && memcmp(body + 8, "WEBP", 4) == 0)
        return 1;
    /* "II*\0" / "MM\0*" prefixes (presumably TIFF, both byte orders - TODO confirm). */
    if (bodyLen >= 4 && (memcmp(body, "II*\0", 4) == 0 || memcmp(body, "MM\0*", 4) == 0))
        return 1;
    /* "BM" prefix (presumably BMP). NOTE(review): a two-byte test will also
       match any text body that begins with "BM". */
    if (bodyLen >= 2 && memcmp(body, "BM", 2) == 0)
        return 1;
    /* Bytes 00 00 01 00 (presumably an ICO header - TODO confirm). */
    if (bodyLen >= 8 && body[0] == 0 && body[1] == 0 && body[2] == 1 && body[3] == 0)
        return 1;
    /* "ftyp" box at offset 4 with one of the listed avif/heic-family brands. */
    if (bodyLen >= 12 && memcmp(body + 4, "ftyp", 4) == 0 &&
        (memcmp(body + 8, "avif", 4) == 0 || memcmp(body + 8, "avis", 4) == 0 ||
         memcmp(body + 8, "heic", 4) == 0 || memcmp(body + 8, "heix", 4) == 0 ||
         memcmp(body + 8, "hevc", 4) == 0 || memcmp(body + 8, "hevx", 4) == 0))
        return 1;
    /* SVG: the body starts with "<svg" (case-sensitive)... */
    if (bodyLen >= 4 && memcmp(body, "<svg", 4) == 0)
        return 1;
    /* ...or starts with an XML declaration and contains "<svg" anywhere (case-insensitive). */
    if (bodyLen >= 5 && memcmp(body, "<?xml", 5) == 0 && OwsMemContainsCi(body, bodyLen, "<svg"))
        return 1;

    return 0;
}

/*
 * Decides whether an HTTP response should be treated as video (including
 * streaming playlists/manifests).
 *
 * host    - parsed host record (HttpContentType and Page are consulted);
 *           may be NULL.
 * packet  - raw response text searched for "Content-Type:"; may be NULL.
 * body    - response body bytes; may be NULL.
 * bodyLen - number of valid bytes in body.
 *
 * Returns 1 when the content type, the URL suffix or a body signature
 * matches, 0 otherwise.
 *
 * NOTE(review): the suffix test uses OwsHasMediaSuffix, whose table also
 * lists audio and subtitle extensions, so such URLs return 1 here as well.
 */
static int OwsIsVideoResponse(const struct sHost *host, const char *packet, const char *body, unsigned int bodyLen)
{
    char *ct;

    if (packet)
    {
        ct = my_stristr((char*)packet, "Content-Type:");
        /* A video or playlist/manifest content type, from the host record or the raw packet. */
        if ((host && (OwsStrContainsCi(host->HttpContentType, "video/") ||
                      OwsStrContainsCi(host->HttpContentType, "application/vnd.apple.mpegurl") ||
                      OwsStrContainsCi(host->HttpContentType, "application/x-mpegurl") ||
                      OwsStrContainsCi(host->HttpContentType, "application/dash+xml"))) ||
            (ct && (my_stristr(ct, "video/") ||
                    my_stristr(ct, "application/vnd.apple.mpegurl") ||
                    my_stristr(ct, "application/x-mpegurl") ||
                    my_stristr(ct, "application/dash+xml"))))
            return 1;
        /* Generic binary content type combined with a media file suffix.
           (The unconditional suffix test below would return 1 for these too.) */
        if (((host && OwsStrContainsCi(host->HttpContentType, "application/octet-stream")) ||
             (ct && my_stristr(ct, "application/octet-stream"))) && host && OwsHasMediaSuffix(host->Page))
            return 1;
    }
    /* No raw packet: only the parsed content type on the host record is available. */
    else if (host && (OwsStrContainsCi(host->HttpContentType, "video/") ||
                      OwsStrContainsCi(host->HttpContentType, "application/vnd.apple.mpegurl") ||
                      OwsStrContainsCi(host->HttpContentType, "application/x-mpegurl") ||
                      OwsStrContainsCi(host->HttpContentType, "application/dash+xml")))
        return 1;

    /* Any media suffix on the URL path is accepted regardless of headers. */
    if (host && OwsHasMediaSuffix(host->Page))
        return 1;

    /* Signature sniffing needs at least four body bytes. */
    if (!body || bodyLen < 4)
        return 0;

    /* Playlist starting with "#EXTM3U". */
    if (bodyLen >= 7 && memcmp(body, "#EXTM3U", 7) == 0)
        return 1;
    /* "<MPD" anywhere in the body, case-insensitive (presumably a DASH manifest root). */
    if (bodyLen >= 32 && OwsMemContainsCi(body, bodyLen, "<MPD"))
        return 1;
    /* "ftyp" box at offset 4 whose brand is neither an M4A/M4B brand nor one
       of the avif/heic-family brands listed here. */
    if (bodyLen >= 12 && memcmp(body + 4, "ftyp", 4) == 0 &&
        !(memcmp(body + 8, "M4A ", 4) == 0 || memcmp(body + 8, "M4B ", 4) == 0 ||
          memcmp(body + 8, "avif", 4) == 0 || memcmp(body + 8, "avis", 4) == 0 ||
          memcmp(body + 8, "heic", 4) == 0 || memcmp(body + 8, "heix", 4) == 0 ||
          memcmp(body + 8, "hevc", 4) == 0 || memcmp(body + 8, "hevx", 4) == 0))
        return 1;
    /* RIFF container whose form type is "AVI ". */
    if (bodyLen >= 12 && memcmp(body, "RIFF", 4) == 0 && memcmp(body + 8, "AVI ", 4) == 0)
        return 1;
    /* Bytes 1A 45 DF A3 (presumably the EBML header used by Matroska/WebM - TODO confirm). */
    if (bodyLen >= 4 && memcmp(body, "\x1a\x45\xdf\xa3", 4) == 0)
        return 1;
    /* "FLV" prefix. */
    if (bodyLen >= 4 && memcmp(body, "FLV", 3) == 0)
        return 1;
    /* 16-byte GUID prefix (presumably the ASF/WMV header object - TODO confirm). */
    if (bodyLen >= 16 && memcmp(body, "\x30\x26\xb2\x75\x8e\x66\xcf\x11\xa6\xd9\x00\xaa\x00\x62\xce\x6c", 16) == 0)
        return 1;
    /* Bytes 00 00 01 BA (presumably an MPEG program stream pack header - TODO confirm). */
    if (bodyLen >= 4 && memcmp(body, "\x00\x00\x01\xba", 4) == 0)
        return 1;
    /* 0x47 at offsets 0 and 188 (presumably MPEG transport stream sync bytes - TODO confirm). */
    if (bodyLen >= 189 && (unsigned char)body[0] == 0x47 && (unsigned char)body[188] == 0x47)
        return 1;

    return 0;
}

/*
 * Decides whether an HTTP response should be treated as audio.
 *
 * host    - parsed host record (HttpContentType and Page are consulted);
 *           may be NULL.
 * packet  - raw response text searched for "Content-Type:"; may be NULL.
 * body    - response body bytes; may be NULL.
 * bodyLen - number of valid bytes in body.
 *
 * Returns 1 when the content type, the URL suffix or a body signature
 * matches, 0 otherwise.
 *
 * NOTE(review): the suffix test uses OwsHasMediaSuffix, whose table also
 * lists video and subtitle extensions, so such URLs return 1 here as well.
 */
static int OwsIsAudioResponse(const struct sHost *host, const char *packet, const char *body, unsigned int bodyLen)
{
    char *ct;

    if (packet)
    {
        ct = my_stristr((char*)packet, "Content-Type:");
        /* An "audio/" content type, from the host record or the raw packet. */
        if ((host && OwsStrContainsCi(host->HttpContentType, "audio/")) ||
            (ct && my_stristr(ct, "audio/")))
            return 1;
        /* Generic binary content type combined with a media file suffix.
           (The unconditional suffix test below would return 1 for these too.) */
        if (((host && OwsStrContainsCi(host->HttpContentType, "application/octet-stream")) ||
             (ct && my_stristr(ct, "application/octet-stream"))) && host && OwsHasMediaSuffix(host->Page))
            return 1;
    }
    /* No raw packet: only the parsed content type on the host record is available. */
    else if (host && OwsStrContainsCi(host->HttpContentType, "audio/"))
        return 1;

    /* Any media suffix on the URL path is accepted regardless of headers. */
    if (host && OwsHasMediaSuffix(host->Page))
        return 1;

    /* Signature sniffing needs at least four body bytes. */
    if (!body || bodyLen < 4)
        return 0;

    /* "ID3" tag prefix. */
    if (bodyLen >= 3 && memcmp(body, "ID3", 3) == 0)
        return 1;
    /* 0xFF followed by a byte whose top three bits are set (presumably an MPEG
       audio / ADTS frame sync - TODO confirm). The 0xf0 test is a subset of the
       0xe0 test, so it never changes the outcome. */
    if (bodyLen >= 2 && (unsigned char)body[0] == 0xff &&
        (((unsigned char)body[1] & 0xf0) == 0xf0 || ((unsigned char)body[1] & 0xe0) == 0xe0))
        return 1;
    /* "ftyp" box at offset 4 with an M4A/M4B brand. */
    if (bodyLen >= 12 && memcmp(body + 4, "ftyp", 4) == 0 &&
        (memcmp(body + 8, "M4A ", 4) == 0 || memcmp(body + 8, "M4B ", 4) == 0))
        return 1;
    /* RIFF container whose form type is WAVE. */
    if (bodyLen >= 12 && memcmp(body, "RIFF", 4) == 0 && memcmp(body + 8, "WAVE", 4) == 0)
        return 1;
    /* FORM container whose type is AIFF or AIFC. */
    if (bodyLen >= 12 && memcmp(body, "FORM", 4) == 0 && (memcmp(body + 8, "AIFF", 4) == 0 || memcmp(body + 8, "AIFC", 4) == 0))
        return 1;
    /* "fLaC" prefix. */
    if (bodyLen >= 4 && memcmp(body, "fLaC", 4) == 0)
        return 1;
    /* "OggS" prefix. */
    if (bodyLen >= 4 && memcmp(body, "OggS", 4) == 0)
        return 1;
    /* "MThd" prefix (presumably a Standard MIDI File header chunk). */
    if (bodyLen >= 4 && memcmp(body, "MThd", 4) == 0)
        return 1;

    return 0;
}

/*
 * In aggressive index mode, decides whether a response looks like a generic
 * downloadable asset (archive, font, office document, forced download...).
 *
 * host    - parsed host record (HttpContentType, HttpContentDisposition and
 *           Page are consulted); may be NULL.
 * packet  - raw response text searched for header names; may be NULL.
 * body    - response body bytes; may be NULL.
 * bodyLen - number of valid bytes in body.
 *
 * Returns 0 immediately when bAggressiveIndexMode is off. Otherwise returns
 * 1 when the URL suffix, the Content-Disposition, the Content-Type or a
 * leading body signature matches one of the patterns below, else 0.
 */
static int OwsIsAggressiveGenericAssetResponse(const struct sHost *host, const char *packet, const char *body, unsigned int bodyLen)
{
    char *ct = NULL;
    char *cd = NULL;
    const char *hostCt = host ? host->HttpContentType : "";
    const char *hostCd = host ? host->HttpContentDisposition : "";

    if (!bAggressiveIndexMode)
        return 0;

    if (packet)
    {
        ct = my_stristr((char*)packet, "Content-Type:");
        cd = my_stristr((char*)packet, "Content-Disposition:");
    }

    /* URL path ends with one of the configured custom extensions. */
    if (host && OwsHasCustomAssetSuffix(host->Page))
        return 1;

    /* The response asks to be saved as a file. */
    if (OwsStrContainsCi(hostCd, "attachment") ||
        OwsStrContainsCi(hostCd, "filename=") ||
        (cd && (my_stristr(cd, "attachment") || my_stristr(cd, "filename="))))
        return 1;

    /* Generic binary / forced-download content types. */
    if (OwsStrContainsCi(hostCt, "application/octet-stream") ||
        OwsStrContainsCi(hostCt, "binary/octet-stream") ||
        OwsStrContainsCi(hostCt, "application/force-download") ||
        OwsStrContainsCi(hostCt, "application/download") ||
        (ct && (my_stristr(ct, "application/octet-stream") ||
                my_stristr(ct, "binary/octet-stream") ||
                my_stristr(ct, "application/force-download") ||
                my_stristr(ct, "application/download"))))
        return 1;

    /* Font, archive, vendor-specific, document and 3D-model content types
       on the parsed host record... */
    if (OwsStrContainsCi(hostCt, "font/") ||
        OwsStrContainsCi(hostCt, "application/font") ||
        OwsStrContainsCi(hostCt, "application/vnd.ms-fontobject") ||
        OwsStrContainsCi(hostCt, "application/zip") ||
        OwsStrContainsCi(hostCt, "application/x-zip") ||
        OwsStrContainsCi(hostCt, "application/x-7z") ||
        OwsStrContainsCi(hostCt, "application/x-rar") ||
        OwsStrContainsCi(hostCt, "application/gzip") ||
        OwsStrContainsCi(hostCt, "application/x-tar") ||
        OwsStrContainsCi(hostCt, "application/vnd.") ||
        OwsStrContainsCi(hostCt, "application/msword") ||
        OwsStrContainsCi(hostCt, "application/epub") ||
        OwsStrContainsCi(hostCt, "application/x-mobipocket") ||
        OwsStrContainsCi(hostCt, "model/"))
        return 1;

    /* ...and the same list against the raw packet. */
    if (ct && (my_stristr(ct, "font/") ||
               my_stristr(ct, "application/font") ||
               my_stristr(ct, "application/vnd.ms-fontobject") ||
               my_stristr(ct, "application/zip") ||
               my_stristr(ct, "application/x-zip") ||
               my_stristr(ct, "application/x-7z") ||
               my_stristr(ct, "application/x-rar") ||
               my_stristr(ct, "application/gzip") ||
               my_stristr(ct, "application/x-tar") ||
               my_stristr(ct, "application/vnd.") ||
               my_stristr(ct, "application/msword") ||
               my_stristr(ct, "application/epub") ||
               my_stristr(ct, "application/x-mobipocket") ||
               my_stristr(ct, "model/")))
        return 1;

    if (body && bodyLen >= 4)
    {
        /*
         * Archive / compression signatures at the start of the body.
         * The last signature is five bytes long, so it carries its own
         * length guard: the enclosing check only guarantees four bytes and
         * comparing five would read past the end of a 4-byte body.
         * ("\xFD" and "7zXZ" are separate literals so the '7' is not
         * absorbed into the hex escape.)
         */
        if (memcmp(body, "PK\003\004", 4) == 0 || memcmp(body, "PK\005\006", 4) == 0 ||
            memcmp(body, "Rar!", 4) == 0 || memcmp(body, "7z\xBC\xAF", 4) == 0 ||
            memcmp(body, "\x1F\x8B", 2) == 0 || memcmp(body, "BZh", 3) == 0 ||
            (bodyLen >= 5 && memcmp(body, "\xFD" "7zXZ", 5) == 0))
            return 1;
        /* Web font containers. */
        if (bodyLen >= 8 && memcmp(body, "wOFF", 4) == 0)
            return 1;
        if (bodyLen >= 8 && memcmp(body, "wOF2", 4) == 0)
            return 1;
        /* Bytes 00 01 00 00 (presumably a TrueType font header - TODO confirm). */
        if (bodyLen >= 4 && memcmp(body, "\x00\x01\x00\x00", 4) == 0)
            return 1;
        /* "OTTO" prefix (presumably an OpenType/CFF font - TODO confirm). */
        if (bodyLen >= 4 && memcmp(body, "OTTO", 4) == 0)
            return 1;
    }

    return 0;
}

/*
 * Bumps exactly one of the gProfileMime* counters according to the content
 * type string `ct`. The categories are tested in a fixed order and the
 * first match wins; a NULL/empty or unrecognised value counts as "other".
 */
static void OwsProfileRecordMime(const char *ct)
{
    if (ct == NULL || ct[0] == '\0')
    {
        gProfileMimeOther++;
        return;
    }

    if (OwsStrContainsCi(ct, "text/html") || OwsStrContainsCi(ct, "application/xhtml"))
    {
        gProfileMimeHtml++;
        return;
    }

    /* Tested before the JSON/video checks, so any type containing "xml",
       "rss" or "atom" lands here. */
    if (OwsStrContainsCi(ct, "xml") || OwsStrContainsCi(ct, "rss") || OwsStrContainsCi(ct, "atom"))
    {
        gProfileMimeXml++;
        return;
    }

    if (OwsStrContainsCi(ct, "json") || OwsStrContainsCi(ct, "manifest"))
    {
        gProfileMimeJson++;
        return;
    }

    if (OwsStrContainsCi(ct, "javascript") || OwsStrContainsCi(ct, "ecmascript") || OwsStrContainsCi(ct, "text/css"))
    {
        gProfileMimeCssJs++;
        return;
    }

    if (OwsStrContainsCi(ct, "application/pdf"))
    {
        gProfileMimePdf++;
        return;
    }

    if (OwsStrContainsCi(ct, "image/"))
    {
        gProfileMimeImage++;
        return;
    }

    if (OwsStrContainsCi(ct, "video/") || OwsStrContainsCi(ct, "mpegurl") || OwsStrContainsCi(ct, "dash+xml"))
    {
        gProfileMimeVideo++;
        return;
    }

    if (OwsStrContainsCi(ct, "audio/"))
    {
        gProfileMimeAudio++;
        return;
    }

    gProfileMimeOther++;
}

/*
 * Records one completed HTTP fetch in the crawler profile: updates the
 * gProfileHttp* and MIME counters and emits one plain-text and one JSON
 * line through CRAWLER_PROFILE_LOG / CRAWLER_PROFILE_JSONL.
 *
 * host        - fetched host record; may be NULL (empty strings are logged).
 * httpStatus  - status line text; may be NULL.
 * parseResult - result code of the response parser; 2 counts as OK and 0 as
 *               an error here (the code's meaning is defined by the caller).
 * bytes       - byte count to report.
 */
static void OwsProfileRecordHttpFetch(const struct sHost *host, const char *httpStatus, int parseResult, unsigned int bytes)
{
    char profileMsg[1024];
    char jsonMsg[1536];
    char jHost[160], jPage[360], jStatus[120], jMime[180], jDisposition[220];
    int statusClass = 0;

    /* Index 9 is the first status-code digit when the line has the shape
       "HTTP/x.y NNN ..." - assumes that shape; TODO confirm against caller. */
    if (httpStatus && strlen(httpStatus) >= 10)
        statusClass = httpStatus[9] - '0';

    /* At most one outcome counter is bumped; parseResult == 2 takes precedence. */
    if (parseResult == 2)
        gProfileHttpOk++;
    else if (statusClass == 3)
        gProfileHttpRedirect++;
    else if (statusClass == 4 || statusClass == 5 || parseResult == 0)
        gProfileHttpError++;

    if (host)
        OwsProfileRecordMime(host->HttpContentType);

    /* Plain-text profile line; the precisions cap each field's length. */
    snprintf(profileMsg,sizeof(profileMsg),
             "fetch host=%.100s page=%.255s status=\"%.80s\" parse=%d bytes=%u mime=\"%.120s\" disposition=\"%.120s\"",
             host ? host->Host : "",
             host ? host->Page : "",
             httpStatus ? httpStatus : "",
             parseResult,
             bytes,
             host ? host->HttpContentType : "",
             host ? host->HttpContentDisposition : "");
    CRAWLER_PROFILE_LOG(profileMsg);
    /* JSON line: every string field goes through OwsJsonEscapeCopy first.
       NOTE(review): the %.Ns precisions below are applied after escaping, so
       a cut could land inside an escape sequence - confirm this is acceptable. */
    OwsJsonEscapeCopy(host ? host->Host : "", jHost, sizeof(jHost));
    OwsJsonEscapeCopy(host ? host->Page : "", jPage, sizeof(jPage));
    OwsJsonEscapeCopy(httpStatus ? httpStatus : "", jStatus, sizeof(jStatus));
    OwsJsonEscapeCopy(host ? host->HttpContentType : "", jMime, sizeof(jMime));
    OwsJsonEscapeCopy(host ? host->HttpContentDisposition : "", jDisposition, sizeof(jDisposition));
    snprintf(jsonMsg,sizeof(jsonMsg),
             "{\"event\":\"fetch\",\"host\":\"%.100s\",\"page\":\"%.255s\",\"status\":\"%.80s\",\"parse\":%d,\"bytes\":%u,\"mime\":\"%.120s\",\"disposition\":\"%.120s\"}",
             jHost,
             jPage,
             jStatus,
             parseResult,
             bytes,
             jMime,
             jDisposition);
    CRAWLER_PROFILE_JSONL(jsonMsg);
}

/*
 * Heuristic used for URLs found in HTTP headers: does `url` look like it
 * points at an asset worth queueing?
 *
 * Returns 1 when the URL contains (case-insensitively) any of the hint
 * substrings below. In aggressive index mode it additionally returns 1 for
 * any URL that contains a '/' (or an http/https scheme) and is not a
 * javascript:, mailto: or tel: link. Returns 0 otherwise, including for
 * NULL or empty input.
 */
static int OwsHeaderUrlLooksAsset(const char *url)
{
    /* Hint table; the empty string at the end terminates the scan. */
    static const char *assetHints[] = {
        ".pdf",".jpg",".jpeg",".png",".gif",".webp",".svg",".avif",".heic",".heif",
        ".mp4",".webm",".m3u8",".mpd",".mp3",".m4a",".aac",".ogg",".opus",".wav",".flac",
        "image","video","audio","media","asset","download","pdf","manifest","\0"
    };
    const char **hint;

    if (url == NULL || url[0] == '\0')
        return 0;

    for (hint = assetHints; (*hint)[0] != '\0'; hint++)
    {
        if (OwsStrContainsCi(url, *hint))
            return 1;
    }

    if (!bAggressiveIndexMode)
        return 0;

    /* Aggressive mode: the value must at least look URL- or path-like... */
    if (!(OwsStrContainsCi(url, "http://") || OwsStrContainsCi(url, "https://") ||
          OwsStrContainsCi(url, "//") || OwsStrContainsCi(url, "/")))
        return 0;

    /* ...and must not be a non-fetchable pseudo link. */
    if (OwsStrContainsCi(url, "javascript:") ||
        OwsStrContainsCi(url, "mailto:") ||
        OwsStrContainsCi(url, "tel:"))
        return 0;

    return 1;
}

/* Forward declaration: OwsQueueHttpLinkAssets below uses it before its definition. */
static void OwsTrimHeaderValue(char *s);

/*
 * Scans the header section of an HTTP response for URLs announced in
 * headers and queues the asset-looking ones for crawling.
 *
 * Two sources are handled:
 *   - "Link:" headers: every <...> target on the line.
 *   - The headers listed in directHeaders: the header value itself (for
 *     values starting with "0;" or "1;", the part after "url=").
 *
 * packet      - raw response text, NUL-terminated; scanning stops at the
 *               first empty line.
 * currentHost - page the response belongs to; passed to ParseUrl and AddUrl
 *               as the referring host, and its level is used for queued URLs.
 *
 * Returns the number of URLs for which AddUrl returned 1 (0 when either
 * argument is NULL).
 */
static int OwsQueueHttpLinkAssets(const char *packet, struct sHost *currentHost)
{
    /* Header names whose whole value is treated as a URL; "\0" ends the list. */
    static const char *directHeaders[] = {
        "Content-Location:",
        "X-Accel-Redirect:",
        "X-Sendfile:",
        "X-Archive-Orig-Location:",
        "X-Original-URL:",
        "X-Reproxy-URL:",
        "Refresh:",
        "\0"
    };
    const char *p;
    int queued = 0;

    if (!packet || !currentHost)
        return 0;

    p = packet;
    while (*p)
    {
        const char *lineEnd;
        const char *cur;
        size_t lineLen;

        /* An empty line marks the end of the header section. */
        if ((p[0] == '\r' && p[1] == '\n') || p[0] == '\n')
            break;
        /* Locate the end of the current line: CRLF, bare LF, or end of string. */
        lineEnd = strstr(p, "\r\n");
        if (!lineEnd)
            lineEnd = strchr(p, '\n');
        if (!lineEnd)
            lineEnd = p + strlen(p);
        lineLen = (size_t)(lineEnd - p);

        if (lineLen >= 5 && strnicmp((char*)p, "Link:", 5) == 0)
        {
            /* Walk every <target> on the Link line. */
            cur = p + 5;
            while (cur < lineEnd)
            {
                const char *lt = memchr(cur, '<', (size_t)(lineEnd - cur));
                const char *gt;
                char raw[MAXURLSIZE];
                size_t len;
                struct sHost linkHost;

                if (!lt)
                    break;
                gt = memchr(lt, '>', (size_t)(lineEnd - lt));
                if (!gt)
                    break;
                /* Characters strictly between '<' and '>'. */
                len = (size_t)(gt - lt - 1);
                /* Empty targets and targets that do not fit in raw are skipped. */
                if (len > 0 && len < sizeof(raw))
                {
                    memcpy(raw, lt + 1, len);
                    raw[len] = '\0';
                    if (OwsHeaderUrlLooksAsset(raw) && ParseUrl(raw, &linkHost, currentHost) != -1)
                    {
                        /* Only a return value of 1 from AddUrl is counted and logged. */
                        if (AddUrl(linkHost, currentHost->level, currentHost) == 1)
                        {
                            char profileMsg[1024];
                            queued++;
                            gProfileSourceHeader++;
                            snprintf(profileMsg,sizeof(profileMsg),
                                     "discover source=http_link_header host=%.100s page=%.255s from=%.100s",
                                     linkHost.Host,linkHost.Page,currentHost->Host);
                            CRAWLER_PROFILE_LOG(profileMsg);
                        }
                    }
                }
                cur = gt + 1;
            }
            /* Advance past the line terminator (2 for CRLF, 1 for LF, 0 at end of string). */
            p = lineEnd + ((lineEnd[0] == '\r' && lineEnd[1] == '\n') ? 2 : (lineEnd[0] ? 1 : 0));
            continue;
        }
        else
        {
            int h;
            /* Compare the line against every direct header name (no early exit on a match). */
            for(h=0; directHeaders[h][0] != '\0'; h++)
            {
                size_t headerLen = strlen(directHeaders[h]);
                if(lineLen > headerLen && strnicmp((char*)p, (char*)directHeaders[h], headerLen) == 0)
                {
                    char raw[MAXURLSIZE];
                    size_t len = lineLen - headerLen;
                    struct sHost linkHost;

                    /* Over-long values are truncated to the buffer rather than skipped. */
                    if(len >= sizeof(raw))
                        len = sizeof(raw)-1;
                    memcpy(raw, p + headerLen, len);
                    raw[len] = '\0';
                    OwsTrimHeaderValue(raw);
                    /* Values of the form "0;...url=X" / "1;...url=X": keep only X.
                       Other delay prefixes are left untouched. */
                    if(strnicmp(raw, "0;", 2) == 0 || strnicmp(raw, "1;", 2) == 0)
                    {
                        char *urlPart = my_stristr(raw, "url=");
                        if(urlPart)
                            memmove(raw, urlPart + 4, strlen(urlPart + 4) + 1);
                        OwsTrimHeaderValue(raw);
                    }
                    if(raw[0] && OwsHeaderUrlLooksAsset(raw) && ParseUrl(raw, &linkHost, currentHost) != -1)
                    {
                        /* Only a return value of 1 from AddUrl is counted and logged. */
                        if(AddUrl(linkHost, currentHost->level, currentHost) == 1)
                        {
                            char profileMsg[1024];
                            queued++;
                            gProfileSourceHeader++;
                            snprintf(profileMsg,sizeof(profileMsg),
                                     "discover source=http_direct_header host=%.100s page=%.255s from=%.100s",
                                     linkHost.Host,linkHost.Page,currentHost->Host);
                            CRAWLER_PROFILE_LOG(profileMsg);
                        }
                    }
                }
            }
        }
        /* Advance past the line terminator (2 for CRLF, 1 for LF, 0 at end of string). */
        p = lineEnd + ((lineEnd[0] == '\r' && lineEnd[1] == '\n') ? 2 : (lineEnd[0] ? 1 : 0));
    }

    return queued;
}

/*
 * Normalises a header value in place:
 *   1. drops leading spaces and tabs,
 *   2. drops trailing spaces, tabs, CR and LF,
 *   3. replaces every remaining control character (< 32) except tab with
 *      a single space.
 * A NULL pointer is ignored.
 */
static void OwsTrimHeaderValue(char *s)
{
    char *lead;
    char *scan;
    size_t len;

    if (s == NULL)
        return;

    /* 1. Leading blanks: shift the remainder (terminator included) to the front. */
    lead = s;
    while (*lead == ' ' || *lead == '\t')
        lead++;
    if (lead != s)
        memmove(s, lead, strlen(lead) + 1);

    /* 2. Trailing blanks and line terminators. */
    len = strlen(s);
    while (len > 0)
    {
        char last = s[len - 1];

        if (last != ' ' && last != '\t' && last != '\r' && last != '\n')
            break;
        s[--len] = '\0';
    }

    /* 3. Neutralise embedded control characters. */
    for (scan = s; *scan != '\0'; scan++)
    {
        unsigned char ch = (unsigned char)*scan;

        if (ch < 32 && ch != '\t')
            *scan = ' ';
    }
}

/*
 * Extracts the value of the first header line that starts with headerName.
 *
 * packet     - raw response text, NUL-terminated; only lines before the
 *              first empty line are examined.
 * headerName - name to match at the start of a line, compared with strnicmp
 *              over its full length (callers in this file pass it with the
 *              trailing ':').
 * out        - receives the trimmed value, always NUL-terminated.
 * outSize    - size of out in bytes; must be at least 2.
 *
 * Returns 1 when a matching header with a non-empty value was found,
 * 0 otherwise (out is then an empty string, provided the arguments were
 * valid). Values longer than outSize - 1 are truncated.
 */
static int OwsExtractHeaderValue(const char *packet, const char *headerName, char *out, size_t outSize)
{
    const char *p;
    const char *lineEnd;
    size_t nameLen;
    size_t len;

    if (!packet || !headerName || !out || outSize < 2)
        return 0;

    out[0] = '\0';
    nameLen = strlen(headerName);
    if (nameLen == 0)
        return 0;

    /* p always points at the start of a line. */
    p = packet;
    while (*p)
    {
        if ((p[0] == '\r' && p[1] == '\n') || p[0] == '\n')
            break; /* end of headers */

        if (strnicmp((char*)p, (char*)headerName, (int)nameLen) == 0)
        {
            /* Skip the name and any blanks, then copy up to the end of the line. */
            p += nameLen;
            while (*p == ' ' || *p == '\t') p++;
            lineEnd = strstr(p, "\r\n");
            if (!lineEnd) lineEnd = strchr(p, '\n');
            if (!lineEnd) lineEnd = p + strlen(p);
            len = (size_t)(lineEnd - p);
            if (len >= outSize) len = outSize - 1;
            memcpy(out, p, len);
            out[len] = '\0';
            OwsTrimHeaderValue(out);
            /* The first matching line decides, even when its value is empty. */
            return out[0] != '\0';
        }

        /* No match: move to the start of the next line, or stop at the last one. */
        lineEnd = strstr(p, "\r\n");
        if (!lineEnd) lineEnd = strchr(p, '\n');
        if (!lineEnd) break;
        p = lineEnd + ((lineEnd[0] == '\r' && lineEnd[1] == '\n') ? 2 : 1);
    }

    return 0;
}

/*
 * Returns 1 when the status line starts (case-insensitively) with an
 * HTTP/1.0 or HTTP/1.1 status of 429, 503 or 504, which this file treats
 * as "server asks us to slow down". Returns 0 otherwise or for NULL.
 */
static int OwsIsThrottleStatus(const char *httpStatus)
{
    /* Every prefix is exactly 12 characters long. */
    static const char *throttlePrefixes[] = {
        "HTTP/1.1 429", "HTTP/1.0 429",
        "HTTP/1.1 503", "HTTP/1.0 503",
        "HTTP/1.1 504", "HTTP/1.0 504"
    };
    size_t idx;

    if (httpStatus == NULL)
        return 0;

    for (idx = 0; idx < sizeof(throttlePrefixes) / sizeof(throttlePrefixes[0]); idx++)
    {
        if (strnicmp(httpStatus, throttlePrefixes[idx], 12) == 0)
            return 1;
    }
    return 0;
}

/*
 * Computes how long to back off, in milliseconds, after a throttling
 * response.
 *
 *   - A "Retry-After:" header that begins with a positive integer wins: its
 *     value in seconds, capped at 120, converted to milliseconds.
 *   - Otherwise 15000 for an HTTP/1.x 429 status line,
 *   - and 5000 in every other case.
 *
 * packet and httpStatus may each be NULL.
 */
static unsigned int OwsRetryAfterMs(const char *packet, const char *httpStatus)
{
    char headerValue[64];

    if (packet != NULL &&
        OwsExtractHeaderValue(packet, "Retry-After:", headerValue, sizeof(headerValue)))
    {
        char *parseEnd = NULL;
        unsigned long secs = strtoul(headerValue, &parseEnd, 10);

        /* parseEnd == headerValue means no digits were consumed. */
        if (parseEnd != headerValue && secs > 0)
        {
            if (secs > 120)
                secs = 120;
            return (unsigned int)(secs * 1000);
        }
    }

    if (httpStatus == NULL)
        return 5000;

    if (strnicmp(httpStatus, "HTTP/1.1 429", 12) == 0 ||
        strnicmp(httpStatus, "HTTP/1.0 429", 12) == 0)
        return 15000;

    return 5000;
}

/*
 * Applies a server-requested backoff to the global crawl pacing state:
 * pushes gServerBackoffUntilMS forward to "now + delay" when that is later
 * than the current value, and raises iServerBackoffCrawlDelay to at least
 * 2000. The delay comes from OwsRetryAfterMs.
 *
 * NOTE(review): the deadline is a plain GetTickCount() sum compared with
 * '>', so tick counter wrap-around is not handled here.
 */
static void OwsApplyServerBackoff(const char *packet, const char *httpStatus)
{
    unsigned int waitMs = OwsRetryAfterMs(packet, httpStatus);
    DWORD deadline = GetTickCount() + waitMs;

    /* Only ever extend the backoff window, never shorten it. */
    if (gServerBackoffUntilMS < deadline)
        gServerBackoffUntilMS = deadline;

    if (!(iServerBackoffCrawlDelay >= 2000))
        iServerBackoffCrawlDelay = 2000;
}

/*
 * Looks up the stored HTTP validators (ETag and Last-Modified) for a page
 * in the se_pdf_map / se_pdf_doc tables.
 *
 * hst               - host record; Host and Page form the lookup key.
 * etagOut/etagOutSize - receives the stored http_etag (trimmed), or "".
 * lmOut/lmOutSize     - receives the stored http_last_modified (trimmed), or "".
 *
 * Returns 1 when a row was found and at least one validator is non-empty,
 * 0 otherwise (including invalid arguments and query failure). The database
 * access runs between thrdBlock(BLOCKINDEX) and thrdUnBlock(BLOCKINDEX).
 */
static int OwsDbGetPdfValidators(struct sHost *hst, char *etagOut, size_t etagOutSize, char *lmOut, size_t lmOutSize)
{
    MYSQL_RES *res = NULL;
    MYSQL_ROW row;
    /* Escape buffers sized 2*N+1, the worst case for mysql_real_escape_string -
       assumes hst->Host / hst->Page hold at most MAXHOSTSIZE / MAXPAGESIZE
       characters; TODO confirm against struct sHost. */
    char eHost[MAXHOSTSIZE * 2 + 1];
    char ePage[MAXPAGESIZE * 2 + 1];
    /* NOTE(review): sql is 1024 bytes while the escaped host and page together
       may be longer; snprintf would then truncate the statement. Confirm the
       sizes or check the snprintf return value. */
    char sql[1024];
    int found = 0;

    if (!hst || !etagOut || !lmOut || etagOutSize < 2 || lmOutSize < 2)
        return 0;

    etagOut[0] = '\0';
    lmOut[0] = '\0';

    thrdBlock(BLOCKINDEX);
    mysql_real_escape_string(&gMysqlDB2, eHost, hst->Host, strlen(hst->Host));
    mysql_real_escape_string(&gMysqlDB2, ePage, hst->Page, strlen(hst->Page));
    snprintf(sql, sizeof(sql),
             "SELECT d.http_etag, d.http_last_modified "
             "FROM se_pdf_map m JOIN se_pdf_doc d ON d.doc_id=m.doc_id "
             "WHERE m.hostname='%s' AND m.page='%s' LIMIT 1",
             eHost, ePage);
    if (mysql_query(&gMysqlDB2, sql) == 0)
    {
        res = mysql_store_result(&gMysqlDB2);
        if (res)
        {
            row = mysql_fetch_row(res);
            if (row)
            {
                /* Columns may be NULL; copy only what is present, truncating to the buffers. */
                if (row[0]) {
                    strncpy(etagOut, row[0], etagOutSize - 1);
                    etagOut[etagOutSize - 1] = '\0';
                    OwsTrimHeaderValue(etagOut);
                }
                if (row[1]) {
                    strncpy(lmOut, row[1], lmOutSize - 1);
                    lmOut[lmOutSize - 1] = '\0';
                    OwsTrimHeaderValue(lmOut);
                }
                if (etagOut[0] || lmOut[0])
                    found = 1;
            }
            mysql_free_result(res);
        }
    }
    thrdUnBlock(BLOCKINDEX);
    return found;
}

/*
 * Adds conditional-request headers to an already formatted HTTP request.
 *
 * requestPacket - NUL-terminated request text ending its header section
 *                 with an empty line ("\r\n\r\n"); modified in place.
 * packetSize    - capacity of requestPacket in bytes.
 * etag          - value for "If-None-Match"; NULL or "" to omit.
 * lastModified  - value for "If-Modified-Since"; NULL or "" to omit.
 *
 * The new header lines are inserted after the last existing header line
 * and before the empty line that ends the header section.
 *
 * Returns 1 on success (also when there was nothing to add). Returns 0,
 * leaving requestPacket untouched, when the arguments are invalid, a
 * validator is too long for the internal scratch buffer, the request has
 * no header terminator, or the result would not fit in packetSize.
 */
static int OwsAppendConditionalHeaders(char *requestPacket, size_t packetSize, const char *etag, const char *lastModified)
{
    char extra[512];
    char *tail;
    char *insertAt;
    size_t reqLen;
    size_t extraLen = 0;
    int written;

    if (!requestPacket || packetSize < 8)
        return 0;

    extra[0] = '\0';
    if (etag && etag[0] != '\0')
    {
        written = snprintf(extra, sizeof(extra), "If-None-Match: %s\r\n", etag);
        /* snprintf returns the length it wanted to write; a value >= the
           buffer size means the line was cut and must not be sent. */
        if (written < 0 || (size_t)written >= sizeof(extra))
            return 0;
        extraLen = (size_t)written;
    }
    if (lastModified && lastModified[0] != '\0')
    {
        written = snprintf(extra + extraLen, sizeof(extra) - extraLen,
                           "If-Modified-Since: %s\r\n", lastModified);
        if (written < 0 || (size_t)written >= sizeof(extra) - extraLen)
            return 0;
        extraLen += (size_t)written;
    }
    if (extraLen == 0)
        return 1;

    tail = strstr(requestPacket, "\r\n\r\n");
    if (!tail)
        return 0;

    reqLen = strlen(requestPacket);
    if (reqLen + extraLen >= packetSize)
        return 0;

    /*
     * tail points at the CRLF that ends the last header line. Insert after
     * that CRLF, so the last header stays intact and the final empty line
     * still terminates the header section.
     */
    insertAt = tail + 2;
    memmove(insertAt + extraLen, insertAt, strlen(insertAt) + 1);
    memcpy(insertAt, extra, extraLen);
    return 1;
}




/* Identifiers of the crawler worker threads and of the server thread.
 * CreateThreads() fills one thrdML slot per worker (plus the matching
 * thrdhML handle on Windows); CreateServerThread() fills thrdServer (plus
 * thrdhServer on Windows). KillThreads() waits on the worker entries. */
#ifdef WIN32
  unsigned thrdML[MAXTHREAD];   /* thread ids written by _beginthreadex */
  HANDLE thrdhML[MAXTHREAD];    /* thread handles returned by _beginthreadex */

  unsigned thrdServer;          /* server thread id */
  HANDLE thrdhServer;           /* server thread handle */
#else
  pthread_t thrdML[MAXTHREAD];  /* worker threads created with pthread_create */
  pthread_t thrdServer;         /* server thread created with pthread_create */
#endif

/*
 * OwsThreadLogAppend: append src to the NUL-terminated string in dst without
 * ever writing past dstSize bytes (the text is truncated if it does not fit).
 */
static void OwsThreadLogAppend(char *dst, size_t dstSize, const char *src)
{
    size_t used;

    if (!dst || !src || dstSize == 0)
        return;
    used = strlen(dst);
    if (used + 1 >= dstSize)
        return;
    snprintf(dst + used, dstSize - used, "%s", src);
}

/*
 * mainThread: entry point of a crawler worker thread.
 *
 * Each loop iteration:
 *   1. takes the next pending node (viewed == 0) from the URL list while
 *      holding BLOCKTHRDHST and marks it as in progress (viewed = 2);
 *   2. honours robots.txt ordering/exclusion, server backoff and crawl delay;
 *   3. connects (optionally through OpenSSL), sends the forged request and
 *      receives the response into a reusable dynamic buffer;
 *   4. parses the response, queues redirect targets, applies throttling
 *      retries, routes binary assets, and finally parses links / indexes the
 *      page;
 *   5. marks the node as done (viewed = 1).
 *
 * pthrdNum carries the worker index as a pointer-sized integer (see
 * CreateThreads()). The loop has no normal exit: the thread leaves through
 * ExitThread(0) when a quit/kill flag is set or a crawl limit is reached.
 *
 * Every early `continue` relies on the UnBlockAll() call at the top of the
 * loop to release whatever mutex the iteration still holds.
 */
#ifdef WIN32
  unsigned __stdcall 
#else
  void* 
#endif
mainThread(LPVOID pthrdNum)
{
    struct sHost currentHst;
    char        *packet = NULL;      /* raw response buffer, reused across iterations */
    char        *html = NULL;        /* decoded body buffer, reused across iterations */
    unsigned int packetCap = 0;
    unsigned int htmlCap = 0;
    char         requestPacket[4096];
    DWORD        tStart=0;
    SOCKET       sock;
    SSL         *ssl = NULL;
    int          snd;
    unsigned int recvdbytes;
    unsigned int htmlLength;
    int          condition=1;
    int          thrdNum = (int)(intptr_t) pthrdNum;
    char         sStdOutTmp[10000];
    char         sStdOut[10000];
    SOCKADDR_IN  mSaddr;
    NODE*        nCur=NULL;
    char         httpStatus[MAXHTTPSTATUSSIZE];
    int          HttpRequestRet;
    char         sLocation[MAXURLSIZE];
    int          crawlDelayLocked;   /* 1 while this iteration holds BLOCKEXCRAWL */

    while(condition)
    {
        /* Release anything a previous iteration left locked. */
        UnBlockAll();
        crawlDelayLocked = 0;

        if(iQuit==1 || bKillThread==1 || bKillThreadReserved==1)
        {
            UnBlockAll();
            /* The code after the loop is never reached, so release the
             * per-thread buffers here before leaving. */
            if (packet) { FREE(packet); }
            if (html) { FREE(html); }
            ExitThread(0);
        }

        if(iStop)
        {
            Sleep(500);
            continue;
        }

        thrdBlock(BLOCKTHRDHST);

        {
            char stopReason[160];
            int limitReached = checkLimitsReason(stopReason, sizeof(stopReason));

            if( limitReached == 1
                || iDoNextHost==1)	//switch to the next host
            {
                if(limitReached == 1)
                    printf("\r\n(%i) Thread stop: %s\r\n", thrdNum, stopReason);
                else
                    printf("\r\n(%i) Thread stop: switching to next host\r\n", thrdNum);

                /* set the status of the pages to be indexed as indexed */
                lstSetNodeStatus(lstFirst,0,1);

                /*Un-block all mutexes owned by this thread (only BLOCKTHRDHST) and...*/
                UnBlockAll();

                /* ...release the per-thread buffers and exit */
                if (packet) { FREE(packet); }
                if (html) { FREE(html); }
                ExitThread(0);
            }
        }

        if((nCur=lstGetNodeByVal(lstFirst,0))!=NULL)
        {
            if(nCur==NULL || nCur->field==NULL)
            {
                thrdUnBlock(BLOCKTHRDHST);
                continue;
            }

            /* robots.txt checked?!? */
            if(bRobotsOK==0)
            {
                /* is this page robots.txt */
                if(!(stricmp(((struct sHost*)nCur->field)->Page,"/robots.txt")==0))
                {
                    /* if not: please wait robots.txt */
                    thrdUnBlock(BLOCKTHRDHST);
                    Sleep(1000);
                    continue;
                }
            }

            if(CheckRobotExclusion(((struct sHost*)nCur->field)->Page)==0)
            {
                ((struct sHost*)nCur->field)->viewed = 1;
                thrdUnBlock(BLOCKTHRDHST);
                bRobotsOK=1;
                continue;
            }

            /* Mark the node as in progress and work on a private copy. */
            ((struct sHost*)nCur->field)->viewed = 2;
            memcpy(&currentHst,((struct sHost*)nCur->field),sizeof(struct sHost));
            currentHst.HttpETag[0] = '\0';
            currentHst.HttpLastModified[0] = '\0';
            currentHst.HttpContentType[0] = '\0';
            currentHst.HttpContentDisposition[0] = '\0';
        }
        else
        {
            thrdUnBlock(BLOCKTHRDHST);
            Sleep(1000);
            continue;
        }

        //TESTING (before: after Unblockall() )
        thrdStatus[thrdNum]=GetTickCount();

        if(currentHst.type == 3)  //current url is not a html page or a plain text file
        {
            if(nCur==NULL || nCur->field==NULL)
                continue;

            ((struct sHost*)nCur->field)->viewed = 1;
            thrdUnBlock(BLOCKTHRDHST);
            bRobotsOK=1;
            continue;
        }

        if(gServerBackoffUntilMS>0)
        {
            DWORD now = GetTickCount();
            if(now < gServerBackoffUntilMS)
                Sleep((int)(gServerBackoffUntilMS - now));
        }

        //(1 - Crawl Delay) Lock the mutex
        {
            int effectiveCrawlDelay = iCrawlDelay;
            if(iServerBackoffCrawlDelay > effectiveCrawlDelay)
                effectiveCrawlDelay = iServerBackoffCrawlDelay;

            if(iRobCrawlDelay>0 || effectiveCrawlDelay>0)
            {
                thrdBlock(BLOCKEXCRAWL);
                /* Remember that we took the lock: the delay globals may change
                 * before the matching unlock below. */
                crawlDelayLocked = 1;

                if(iQuit==1 || bKillThread==1 || bKillThreadReserved==1)
                {
                    UnBlockAll();
                    if (packet) { FREE(packet); }
                    if (html) { FREE(html); }
                    ExitThread(0);
                }

                Sleep(  (iRobCrawlDelay>0) ? iRobCrawlDelay*1000 : effectiveCrawlDelay );
            }
        }

        thrdUnBlock(BLOCKTHRDHST);

        tStart= GetTickCount();

        if(!LoadSocket(&sock,&currentHst,&mSaddr))
        {
            closesocket(sock);
            fprintf(stderr,"\r\n(%i) Socket(%s) error: %s\r\n\r\n",
                    thrdNum,
                    currentHst.Host,
                    OwsSocketSetupErrorString());

            ((struct sHost*)nCur->field)->viewed = 1;
            nErrorPages++;
            if(stricmp(currentHst.Host,IndexingHost.Host)==0)
                iDoNextHost = 1;
            bRobotsOK=1;

            continue;
        }

        {
            int connectOk = 0;
            int connectTry;
            for (connectTry = 0; connectTry < 3; connectTry++) {
                if (connect(sock, (LPSOCKADDR) &mSaddr, sizeof(mSaddr)) != SOCKET_ERROR) {
                    connectOk = 1;
                    break;
                }

                int sockErr = OwsSocketLastError();
                /* Retry briefly on transient network errors. */
                if (OwsSocketErrorIsTransient(sockErr)) {
                    Sleep(180);
                    continue;
                }
                break;
            }
            if (!connectOk)
            {
                int sockErr = OwsSocketLastError();
                fprintf(stderr,"\r\n(%i) Connect(%s) error (errno=%d: %s)\r\n\r\n",
                        thrdNum,currentHst.Host,sockErr,OwsSocketErrorString(sockErr));
                closesocket(sock);

                ((struct sHost*)nCur->field)->viewed = 1;
                bRobotsOK=1;

                continue;
            }
        }

        /* Apply the I/O timeout to the crawler socket */
        if(!setnonblock(sock, TIMEOUTs))
        {
            fprintf(stderr,"\r\n(%i) set timeout(%s) error\r\n\r\n",thrdNum,currentHst.Host);
            closesocket(sock);
            ((struct sHost*)nCur->field)->viewed = 1;
            bRobotsOK=1;
            continue;
        }

        /* For HTTPS, run OpenSSL on top of this socket */
        if (currentHst.isSSL) {
            if (!g_ssl_ctx) {
                fprintf(stderr,
                        "\r\n(%i) SSL context not initialized for host %s\r\n\r\n",
                        thrdNum, currentHst.Host);
                closesocket(sock);
                ((struct sHost*)nCur->field)->viewed = 1;
                bRobotsOK = 1;
                continue;
            }

            ssl = SSL_new(g_ssl_ctx);
            if (ssl == NULL) {
                fprintf(stderr,
                        "\r\n(%i) SSL_new error on host %s\r\n\r\n",
                        thrdNum, currentHst.Host);
                closesocket(sock);
                ((struct sHost*)nCur->field)->viewed = 1;
                bRobotsOK = 1;
                continue;
            }

            SSL_set_fd(ssl, sock);

            /* SNI (Server Name Indication), needed by modern sites */
            if (!SSL_set_tlsext_host_name(ssl, currentHst.Host)) {
                fprintf(stderr,
                        "\r\n(%i) SSL_set_tlsext_host_name error on host %s\r\n",
                        thrdNum, currentHst.Host);
                ERR_print_errors_fp(stderr);
            }

            {
                int sslOk = 0;
                int sslTry;
                for (sslTry = 0; sslTry < 10; sslTry++) {
                    int rc = SSL_connect(ssl);
                    if (rc == 1) {
                        sslOk = 1;
                        break;
                    }

                    /* Handshake not finished: some servers need a short retry */
                    {
                        int sslErr = SSL_get_error(ssl, rc);
                        if (sslErr == SSL_ERROR_WANT_READ || sslErr == SSL_ERROR_WANT_WRITE) {
                            Sleep(120);
                            continue;
                        }
                        if (sslErr == SSL_ERROR_SYSCALL &&
#ifdef WIN32
                            OwsSocketLastError() == WSAEINTR
#else
                            errno == EINTR
#endif
                           ) {
                            continue;
                        }

                        {
                            int sockErr = OwsSocketLastError();
                            fprintf(stderr,
                                    "\r\n(%i) SSL_connect error on host %s (ssl_err=%d, socket_err=%d: %s)\r\n",
                                    thrdNum, currentHst.Host, sslErr, sockErr, OwsSocketErrorString(sockErr));
                        }
                        ERR_print_errors_fp(stderr);
                        break;
                    }
                }

                if (!sslOk) {
                    SSL_shutdown(ssl);
                    SSL_free(ssl);
                    ssl = NULL;
                    closesocket(sock);
                    ((struct sHost*)nCur->field)->viewed = 1;
                    bRobotsOK = 1;
                    continue;
                }
            }
        }

        memset(requestPacket,0,sizeof(requestPacket));
        sStdOut[0]=0;

        if(!ForgeHTTPPacket(currentHst,requestPacket,sizeof(requestPacket)))
        {
            /* The TLS session was already set up above: free it too, otherwise
             * the SSL object is lost when the next iteration overwrites ssl. */
            if (ssl)
            {
                SSL_shutdown(ssl);
                SSL_free(ssl);
                ssl = NULL;
            }
            closesocket(sock);
            ((struct sHost*)nCur->field)->viewed = 1;
            bRobotsOK=1;
            continue;
        }

        /* type 4 entries: reuse validators stored by a previous fetch so the
         * request can be made conditional. */
        if (currentHst.type == 4)
        {
            char prevEtag[256];
            char prevLm[128];
            prevEtag[0] = '\0';
            prevLm[0] = '\0';
            if (OwsDbGetPdfValidators(&currentHst, prevEtag, sizeof(prevEtag), prevLm, sizeof(prevLm)))
                OwsAppendConditionalHeaders(requestPacket, sizeof(requestPacket), prevEtag, prevLm);
        }

        if(currentHst.isSSL && ssl)
            snd = SSL_write(ssl, requestPacket, (int)strlen(requestPacket));
        else
            snd = SEND(sock,requestPacket);

        /* Log the URL with the right scheme */
        {
            const char *scheme = currentHst.isSSL ? "https" : "http";
            int defaultPort = currentHst.isSSL ? 443 : PORT;

            if(currentHst.port != defaultPort)
                snprintf(sStdOut, sizeof(sStdOut),
                         "(%i) Current -> %s://%s:%i%s (%s)",
                         thrdNum, scheme,
                         currentHst.Host, currentHst.port,
                         currentHst.Page, currentHst.Description);
            else
                snprintf(sStdOut, sizeof(sStdOut),
                         "(%i) Current -> %s://%s%s (%s)",
                         thrdNum, scheme,
                         currentHst.Host, currentHst.Page,
                         currentHst.Description);
        }

        if(snd<10)
        {
            if(currentHst.isSSL && ssl)
            {
                SSL_shutdown(ssl);
                SSL_free(ssl);
                ssl = NULL;
            }
            closesocket(sock);
            OwsThreadLogAppend(sStdOut, sizeof(sStdOut), "\t\t[SEND ERROR]\n\n");
            printf("%s",sStdOut);

            ((struct sHost*)nCur->field)->viewed = 1;
            bRobotsOK=1;

            continue;
        }

        /* receive (dynamic buffer) */
        if (currentHst.isSSL && ssl)
        {
            recvdbytes = RecvPacketsSSLDyn(ssl, &packet, &packetCap, MAXHTTPDOWNLOADSIZE);
        }
        else
        {
            recvdbytes = RecvPacketsDyn(&sock, &packet, &packetCap, MAXHTTPDOWNLOADSIZE);
        }

        /* Close SSL only if the connection really was HTTPS */
        if (currentHst.isSSL && ssl)
        {
            SSL_shutdown(ssl);
            SSL_free(ssl);
            ssl = NULL;
        }

        /* Close the socket exactly once */
        closesocket(sock);

        /* Very important: make sure 'packet' is always a NUL-terminated C
         * string so ParseHTTPRequest can use strlen()/strstr() on it.
         * NOTE(review): assumes the receive helpers keep packetCap > recvdbytes. */
        if (packet)
            packet[recvdbytes] = '\0';

        //(2 - Crawl Delay) File received: unlock the mutex.
        // Keyed on whether THIS iteration took the lock, so the unlock always
        // matches the lock even when only the server backoff delay is active.
        if(crawlDelayLocked)
        {
            thrdUnBlock(BLOCKEXCRAWL);
            crawlDelayLocked = 0;
        }

        if(packet==NULL || recvdbytes<=10)
        {
            ((struct sHost*)nCur->field)->viewed = 1;
            OwsThreadLogAppend(sStdOut, sizeof(sStdOut), "\t\t[RECV ERROR]\n\n");
            printf("%s",sStdOut);
            bRobotsOK=1;

            continue;
        }

        bytesDownloaded+=recvdbytes;

        /* Grow the body buffer so it can hold the whole response. */
        if (htmlCap < recvdbytes + 1)
        {
            char *newHtml = (char*)realloc(html, recvdbytes + 1);
            if (!newHtml)
            {
                ((struct sHost*)nCur->field)->viewed = 1;
                OwsThreadLogAppend(sStdOut, sizeof(sStdOut), "\t\t[ALLOC ERROR]\n\n");
                printf("%s",sStdOut);
                bRobotsOK = 1;
                continue;
            }
            html = newHtml;
            htmlCap = recvdbytes + 1;
        }

        if (html)
            html[0] = '\0';
        htmlLength = 0;

        if ( (HttpRequestRet = ParseHTTPRequest(packet,
                                                recvdbytes,
                                                html,
                                                htmlCap,
                                                httpStatus,
                                                sLocation,
                                                currentHst.level,
                                                &htmlLength)) != 0 )
        {
            /* Capture the response metadata on the private copy and mirror it
             * onto the queued node. */
            OwsExtractHeaderValue(packet, "ETag:", currentHst.HttpETag, sizeof(currentHst.HttpETag));
            OwsExtractHeaderValue(packet, "Last-Modified:", currentHst.HttpLastModified, sizeof(currentHst.HttpLastModified));
            OwsExtractHeaderValue(packet, "Content-Type:", currentHst.HttpContentType, sizeof(currentHst.HttpContentType));
            OwsExtractHeaderValue(packet, "Content-Disposition:", currentHst.HttpContentDisposition, sizeof(currentHst.HttpContentDisposition));
            if (nCur && nCur->field)
            {
                struct sHost *nodeHst = (struct sHost*)nCur->field;

                strncpy(nodeHst->HttpETag, currentHst.HttpETag, sizeof(nodeHst->HttpETag) - 1);
                nodeHst->HttpETag[sizeof(nodeHst->HttpETag) - 1] = '\0';
                strncpy(nodeHst->HttpLastModified, currentHst.HttpLastModified, sizeof(nodeHst->HttpLastModified) - 1);
                nodeHst->HttpLastModified[sizeof(nodeHst->HttpLastModified) - 1] = '\0';
                strncpy(nodeHst->HttpContentType, currentHst.HttpContentType, sizeof(nodeHst->HttpContentType) - 1);
                nodeHst->HttpContentType[sizeof(nodeHst->HttpContentType) - 1] = '\0';
                strncpy(nodeHst->HttpContentDisposition, currentHst.HttpContentDisposition, sizeof(nodeHst->HttpContentDisposition) - 1);
                nodeHst->HttpContentDisposition[sizeof(nodeHst->HttpContentDisposition) - 1] = '\0';
            }
            OwsProfileRecordHttpFetch(&currentHst, httpStatus, HttpRequestRet, htmlLength);
            if (HttpRequestRet == 2 && bAggressiveIndexMode)
            {
                int nHeaderAssets = OwsQueueHttpLinkAssets(packet, &currentHst);
                if (nHeaderAssets > 0)
                {
                    snprintf(sStdOutTmp, sizeof(sStdOutTmp), " - Header assets queued (%i URL)\n", nHeaderAssets);
                    OwsThreadLogAppend(sStdOut, sizeof(sStdOut), sStdOutTmp);
                }
            }

            int isRedirect =
                ( strnicmp(httpStatus,"HTTP/1.1 302",12) == 0 ||
                  strnicmp(httpStatus,"HTTP/1.0 302",12) == 0 ||
                  strnicmp(httpStatus,"HTTP/1.1 301",12) == 0 ||
                  strnicmp(httpStatus,"HTTP/1.0 301",12) == 0 );

            /* On a 301/302 with a Location header, queue the new URL */
            if (isRedirect && sLocation[0] != '\0' && sLocation[0] != '<')
            {
                struct sHost redirHost;
                char normalizedLocation[MAXURLSIZE];
                char dbg[1024];

                /* Use the URL ONLY if ParseUrl does NOT return -1 (error) */
                if (OwsNormalizeRedirectLocation(&currentHst, sLocation, normalizedLocation, sizeof(normalizedLocation)) &&
                    ParseUrl(normalizedLocation, &redirHost, &currentHst) != -1)
                {
                    redirHost.redirectDepth = currentHst.redirectDepth + 1;
                    if (OwsSameHostPage(&redirHost, &currentHst))
                    {
                        snprintf(dbg, sizeof(dbg),
                                 "redirect self skip host=%.100s page=%.255s location=%.255s",
                                 currentHst.Host,
                                 currentHst.Page,
                                 normalizedLocation);
                        DEBUG_LOG(dbg);
                    }
                    else if (lstGetNodeByHost(lstFirst, redirHost) != NULL)
                    {
                        snprintf(dbg, sizeof(dbg),
                                 "redirect duplicate skip host=%.100s page=%.255s location=%.255s",
                                 currentHst.Host,
                                 currentHst.Page,
                                 normalizedLocation);
                        DEBUG_LOG(dbg);
                    }
                    else if (redirHost.redirectDepth <= OWS_MAX_REDIRECT_CHAIN)
                    {
                        AddUrl(redirHost, currentHst.level + 1, &currentHst);
                    }
                    else
                    {
                        snprintf(dbg, sizeof(dbg),
                                 "redirect chain skip host=%.100s page=%.255s location=%.255s depth=%u max=%u",
                                 currentHst.Host,
                                 currentHst.Page,
                                 normalizedLocation,
                                 (unsigned int)redirHost.redirectDepth,
                                 (unsigned int)OWS_MAX_REDIRECT_CHAIN);
                        DEBUG_LOG(dbg);
                    }
                }
            }

            /* Console log */
            if (isRedirect)
            {
                snprintf(sStdOutTmp, sizeof(sStdOutTmp),
                         "\n - HTTP header: %s\n - Location: %s\n - Downloaded %u Kb (%u bytes) in %i ms\n",
                         httpStatus,
                         sLocation[0] ? sLocation : "<no Location>",
                         recvdbytes / 1024,
                         recvdbytes,
                         (int)(GetTickCount() - tStart));
            }
            else
            {
                snprintf(sStdOutTmp, sizeof(sStdOutTmp),
                         "\n - HTTP header: %s\n - Downloaded %u Kb (%u bytes) in %i ms\n",
                         httpStatus,
                         recvdbytes / 1024,
                         recvdbytes,
                         (int)(GetTickCount() - tStart));
            }

            OwsThreadLogAppend(sStdOut, sizeof(sStdOut), sStdOutTmp);

            /* Throttled by the server: put the node back in the queue
             * (viewed = 0), at most 3 times. */
            if (OwsIsThrottleStatus(httpStatus) && nCur && nCur->field &&
                ((struct sHost*)nCur->field)->retryCount < 3)
            {
                ((struct sHost*)nCur->field)->retryCount++;
                OwsApplyServerBackoff(packet, httpStatus);
                snprintf(sStdOutTmp, sizeof(sStdOutTmp),
                         " - Server backoff: retry queued (%u/3), crawl delay %i ms\n\n",
                         ((struct sHost*)nCur->field)->retryCount,
                         iServerBackoffCrawlDelay);
                OwsThreadLogAppend(sStdOut, sizeof(sStdOut), sStdOutTmp);
                printf("%s", sStdOut);
                ((struct sHost*)nCur->field)->viewed = 0;
                bRobotsOK = 1;
                continue;
            }

            /* Error page counter (4xx/5xx) */
            if ( strnicmp(httpStatus,"HTTP/1.1 4",10) == 0 ||
                 strnicmp(httpStatus,"HTTP/1.0 4",10) == 0 ||
                 strnicmp(httpStatus,"HTTP/1.1 5",10) == 0 ||
                 strnicmp(httpStatus,"HTTP/1.0 5",10) == 0 )
            {
                nErrorPages++;
            }
        }
        else
        {
            char profileMsg[1024];
            gProfileHttpError++;
            snprintf(profileMsg,sizeof(profileMsg),
                     "fetch parse_error host=%.100s page=%.255s bytes=%u",
                     currentHst.Host,currentHst.Page,recvdbytes);
            CRAWLER_PROFILE_LOG(profileMsg);
            ((struct sHost*)nCur->field)->viewed = 1;
            OwsThreadLogAppend(sStdOut, sizeof(sStdOut), "\t\t[RECV ERROR]\n\n");
            printf("%s",sStdOut);
            bRobotsOK = 1;
            continue;
        }

        /* Promote binary assets handled by external modules using HTTP metadata. */
        if (HttpRequestRet == 2 &&
            currentHst.type != 4 &&
            (OwsIsPdfResponse(&currentHst, packet, html, htmlLength) ||
             OwsIsImageResponse(&currentHst, packet, html, htmlLength) ||
             OwsIsVideoResponse(&currentHst, packet, html, htmlLength) ||
             OwsIsAudioResponse(&currentHst, packet, html, htmlLength) ||
             OwsIsAggressiveGenericAssetResponse(&currentHst, packet, html, htmlLength)))
        {
            char dbg[1024];
            currentHst.type = 4;
            if (nCur && nCur->field)
                ((struct sHost*)nCur->field)->type = 4;
            snprintf(dbg,sizeof(dbg),
                     "route type=4 host=%s page=%s mime=%s bytes=%u",
                     currentHst.Host,
                     currentHst.Page,
                     currentHst.HttpContentType,
                     htmlLength);
            DEBUG_LOG(dbg);
        }

        if(bRobotsOK==0 && stricmp(((struct sHost*)nCur->field)->Page,"/robots.txt")==0)
        {
            printf("%s",sStdOut);

            if(HttpRequestRet==2)
                ParseRobotsTxt(html,currentHst);
            else
            {
                printf(" - Nothing to do with robots.txt\n\n");

            }

            ((struct sHost*)nCur->field)->viewed = 1;
            bRobotsOK=1;

            continue;
        }
        else
            bRobotsOK=1;

        //Index only HTML(1),plain text files(2)and custom handled files(4)
        if(currentHst.type <= 2 || currentHst.type == 4)
        {
            /* Check the number of pages indexed or if we are switching to the next host */
            if( checkLimits() == 1
                || iDoNextHost==1)	//switch to the next host
            {
                ((struct sHost*)nCur->field)->viewed = 1;
                continue;
            }

            nPagesViewed++;

            /* Parse XML sitemap files and queue URLs (including .pdf links). */
            if(HttpRequestRet==2 && currentHst.type==1)
            {
                int nSitemapUrls = ParseSitemapXML(html,currentHst,currentHst.level);
                if(nSitemapUrls>0)
                {
                    snprintf(sStdOutTmp, sizeof(sStdOutTmp), " - Sitemap parsed (%i URL queued)\n",nSitemapUrls);
                    OwsThreadLogAppend(sStdOut, sizeof(sStdOut), sStdOutTmp);
                }
            }

            if(currentHst.type == 1)					//Looks for urls only in html page
            {
                int nUrlsFound;

                /* Run the scan BEFORE reading the clock again: inside a single
                 * call the argument evaluation order is unspecified, so the
                 * elapsed time could otherwise be taken before the scan ran. */
                tStart=GetTickCount();
                nUrlsFound = LookForUrls(html,currentHst);
                snprintf(sStdOutTmp, sizeof(sStdOutTmp), " - Checked in %i ms (%i URL found)\n",(int)(GetTickCount()-tStart),nUrlsFound);
                OwsThreadLogAppend(sStdOut, sizeof(sStdOut), sStdOutTmp);
            }

            tStart=GetTickCount();

            if(HttpRequestRet==2)	//Index only 200 OK
            {
                if(bUseRegularExpressionA==1)	//are we using a regular expression filter?
                {	//yes
                    if(regexec(&regexPageFilter, currentHst.Page, 0, 0, 0) == 0)
                    {	//match...index
                        tStart=GetTickCount();

                        if(IndexPage(html,currentHst, htmlLength)==1)
                            snprintf(sStdOutTmp, sizeof(sStdOutTmp), " - Indexed in %i ms\n\n",(int)(GetTickCount()-tStart));
                        else
                            snprintf(sStdOutTmp, sizeof(sStdOutTmp), "\n");

                        OwsThreadLogAppend(sStdOut, sizeof(sStdOut), sStdOutTmp);
                    }
                    else
                    {	//discard
                        OwsThreadLogAppend(sStdOut, sizeof(sStdOut), "\n");
                    }

                }
                else
                {	//index

                    tStart=GetTickCount();
                    if(IndexPage(html,currentHst, htmlLength)==1)
                        snprintf(sStdOutTmp, sizeof(sStdOutTmp), " - Indexed in %i ms\n\n",(int)(GetTickCount()-tStart));
                    else
                        snprintf(sStdOutTmp, sizeof(sStdOutTmp), "\n");

                    OwsThreadLogAppend(sStdOut, sizeof(sStdOut), sStdOutTmp);
                }
            }

            printf("%s",sStdOut);

        }

        ((struct sHost*)nCur->field)->viewed = 1;

    }/*while(condition)*/

    /* Not reached while `condition` stays 1; kept for completeness. */
    if (packet) { FREE(packet); }
    if (html) { FREE(html); }

    return 0;
}

/*
 * KillThreads: wait for every crawler worker to finish, then reset the shared
 * crawl state so a new batch of workers can be started.
 */
void KillThreads()
{
    int worker = 0;

    printf("Killing Threads...\r\n\r\n");

    while (worker < nThread)
    {
#ifdef WIN32
        /* Give the worker up to 50 s to leave on its own, then force it. */
        WaitForSingleObject(thrdhML[worker],50000);
        TerminateThread(thrdhML[worker],0);
        CloseHandle(thrdhML[worker]);
#else
        if (thrdML[worker] != 0)
        {
            pthread_join(thrdML[worker],NULL);
        }
#endif
        worker++;
    }

    init_mutex();

    printf("Threads killed\r\n\r\n");

    /* Nodes still flagged 2 (indexing) are marked 1 (indexed). */
    lstSetNodeStatus(lstFirst, 2, 1);

    /* If the robots.txt download timed out, the call above marked it as
     * indexed while bRobotsOK is still 0 (un-parsed), which would block the
     * spider: force it to 1 (parsed). */
    bRobotsOK = 1;

    /* debug */
    //lstDebugNodes(lstFirst,0);
    //lstDebugNodes(lstFirst,2);
    //lstDebugNodes(lstFirst,1);

    bKillThread = 0;
}
/* Wrappers for send/recv with or without SSL.
 *
 * Currently unused: they are kept disabled with #if 0 to avoid
 * -Wunused-function warnings while remaining available as a reference.
 */
#if 0
/* Sends len bytes from buf through ssl when it is non-NULL, otherwise through
 * the plain socket; returns whatever SSL_write()/send() returns. */
static int ows_send(SOCKET sock, SSL *ssl, const void *buf, size_t len)
{
    if (ssl != NULL) {
        /* SSL_write takes a const void* buffer */
        return SSL_write(ssl, buf, (int)len);
    } else {
        /* send takes a const void* buffer */
        return send(sock, buf, (int)len, 0);
    }
}

/* Reads up to len bytes into buf through ssl when it is non-NULL, otherwise
 * from the plain socket; returns whatever SSL_read()/recv() returns. */
static int ows_recv(SOCKET sock, SSL *ssl, void *buf, size_t len)
{
    if (ssl != NULL) {
        /* SSL_read takes a void* buffer */
        return SSL_read(ssl, buf, (int)len);
    } else {
        /* recv takes a void* buffer */
        return recv(sock, buf, (int)len, 0);
    }
}
#endif



/*
 * CreateThreads: (re)initialise the mutexes and start nThread crawler workers
 * running mainThread(), each one receiving its own index as argument.
 * The process exits if a worker cannot be created.
 */
void CreateThreads()
{
    int slot = 0;
    int rc;

    init_mutex();

    printf("\r\n");

    while (slot < nThread)
    {
        printf("\rCreating thread %i of %i     ",slot+1,nThread);
        fflush(stdout);

#ifdef WIN32
        thrdhML[slot] = (HANDLE)_beginthreadex(NULL,0,mainThread,(void*)(intptr_t)slot,0,&thrdML[slot]);
        if (thrdhML[slot] == 0)
        {
            printf("\r\nThread error (%lu):\r\n",GetLastError());
            exit(0);
        }
#else
        rc = pthread_create(&thrdML[slot], NULL, mainThread, (void*)(intptr_t)slot);
        if (rc != 0)
        {
            printf("\r\nThread error (%i):\r\n",rc);
            perror(" -    pthread_create() ");
            exit(0);
        }
#endif
        /* Seed the watchdog timestamp read by CheckThreads(). */
        thrdStatus[slot] = GetTickCount();
        slot++;
    }

    printf("\r\n");
}

/*
 * CreateServerThread: start the thread that runs StartOWSServer(), passing
 * the listening port as a pointer-sized integer. The process exits if the
 * thread cannot be created.
 */
void CreateServerThread(int port)
{
#ifdef WIN32
    thrdhServer = (HANDLE)_beginthreadex(NULL,0,StartOWSServer,(void*)(intptr_t)port,0,&thrdServer);
    if (thrdhServer != 0)
        return;

    printf("\r\nThread error (%lu):\r\n",GetLastError());
    exit(0);
#else
    int rc = pthread_create(&thrdServer, NULL, StartOWSServer, (void*)(intptr_t)port);

    if (rc == 0)
        return;

    printf("\r\nThread error (%i):\r\n",rc);
    perror(" -    pthread_create() ");
    exit(0);
#endif
}

/*
 * CreateHandleConnectionThread: start a thread running HandleConnection()
 * for one accepted connection.
 *
 * struct_connection  connection descriptor handed to the new thread.
 *
 * Windows: on failure the descriptor is released here. On success the thread
 * handle returned by _beginthreadex is closed immediately: nobody keeps it,
 * and closing the handle does not stop the thread, so this only prevents
 * leaking one handle per connection.
 *
 * POSIX: the process exits if the thread cannot be created.
 */
void CreateHandleConnectionThread(struct sHandleConnection* struct_connection)
{

#ifdef WIN32
    uintptr_t hThread = _beginthreadex(NULL,0,HandleConnection,(void*)struct_connection,0,NULL);

    if(hThread == 0)
    {
            printf("\r\nThread error (%lu):\r\n",GetLastError());
            FREE(struct_connection);
            return;
    }

    CloseHandle((HANDLE)hThread);
#else
int errorCode;
pthread_t ptTmp;
    /* NOTE(review): the thread is created joinable and its id is discarded;
     * unless HandleConnection detaches itself its resources are never
     * reclaimed - confirm and detach if needed. */
    if( (errorCode=pthread_create(&ptTmp, NULL, HandleConnection, (void*)struct_connection)) != 0 )
    {
            printf("\r\nThread error (%i):\r\n",errorCode);
            perror(" -    pthread_create() ");
            exit(0);
    }
#endif

}


/*
 * CheckThreads: watchdog for the crawler workers.
 *
 * Averages, over all workers, the time elapsed (in GetTickCount() ticks)
 * since each worker last refreshed its thrdStatus slot; when the average
 * exceeds AVGTHREADDELAY it sets bKillThread so the workers exit.
 * Nothing is done while a host switch is pending (iDoNextHost != 0) or when
 * there are no workers (which would otherwise divide by zero).
 */
void CheckThreads()
{
    DWORD curTickCount;
    uint64_t totalDelay = 0;   /* wide accumulator: an int sum can overflow */
    int i;

    if (iDoNextHost != 0 || nThread <= 0)
        return;

    curTickCount = GetTickCount();
    for (i = 0; i < nThread; i++)	//Check the status of the threads
    {
        if (curTickCount > thrdStatus[i])
            totalDelay += (uint64_t)(curTickCount - thrdStatus[i]);
        else
            totalDelay++;
    }

    if (totalDelay / (uint64_t)nThread > (uint64_t)AVGTHREADDELAY)
    {
        ERROR_LOG("Notice: Killing thrads avgSec>100000");
        bKillThread = 1;
    }
}


/* Disabled: these would mark ows_send/ows_recv as used, but the wrappers are
 * themselves compiled out with #if 0, so nothing here is built. */
#if 0
(void)ows_send;
(void)ows_recv;
#endif


#endif

/*EOF*/
