/* OpenWebSpider
 *
 *  Author:     Stefano Alimonti aka Shen139
 *  Mail:       shen139 [at] openwebspider (dot) org
 *
 * This file is part of OpenWebSpider
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <time.h>
#include <openssl/sha.h>

#include "../../platform.h"

#if defined(_WIN32) && !defined(WIN32)
#define WIN32
#endif

#ifdef WIN32
#include <direct.h>
#include <fcntl.h>
#include <io.h>
#include <process.h>
#include <sys/stat.h>
#define mkdir(path, mode) _mkdir(path)
#define unlink _unlink
#define close _close
#define fdopen _fdopen
#define getpid _getpid
#define popen _popen
#define pclose _pclose
/* Windows stand-in for POSIX mkstemp(): _mktemp_s() fills in the template
 * in place, then the file is created exclusively and opened read/write in
 * binary mode. Returns the descriptor from _open(), or -1 when templ is
 * NULL or the template could not be filled in. */
static int ows_mkstemp(char *templ)
{
    const int openFlags = _O_CREAT | _O_EXCL | _O_RDWR | _O_BINARY;
    const int permBits = _S_IREAD | _S_IWRITE;

    if (templ == NULL)
        return -1;
    if (_mktemp_s(templ, strlen(templ) + 1) != 0)
        return -1;

    return _open(templ, openFlags, permBits);
}
#define mkstemp ows_mkstemp
#else
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>
#endif

#include "../modHeader.h"
#if defined(__has_include)
  #if __has_include(<mysql/mysql.h>)
    #include <mysql/mysql.h>
  #elif __has_include(<mysql.h>)
    #include <mysql.h>
  #elif __has_include(<mariadb/mysql.h>)
    #include <mariadb/mysql.h>
  #else
    #include "../../mysql/mysql.h"
  #endif
#else
  #include "../../mysql/mysql.h"
#endif

#ifndef MAXPACKETBUFSIZE
#define MAXPACKETBUFSIZE 200000
#endif

/* Path of the pdftotext executable. modFilter() returns 1 without doing any
 * PDF work while this is empty. NOTE(review): presumably filled in by module
 * configuration code outside this chunk - confirm. */
char pdftotextPath[500];
/* Command prefix used by parse_pdfinfo(); metadata extraction is skipped
 * while this is empty. */
char pdfinfoPath[500];
/* NOTE(review): not referenced in the visible part of the file; presumably
 * the line limit handed to build_snippet() - confirm. */
int gSnippetLines = 12;
/* Set to 1 by ensure_pdf_schema() once its CREATE TABLE statements have
 * succeeded, so later calls return immediately. */
static int gPdfSchemaReady = 0;
/* Same one-shot flag for ensure_media_schema(). */
static int gMediaSchemaReady = 0;

/* Buffer size for a SHA-256 digest rendered as hex: 64 digits plus the
 * terminating NUL written by sha256_hex(). */
#define PDF_HASH_HEX_LEN 65
/* mkstemp() templates for the temporary PDF input and text output files
 * used by modFilter(). OWS_TMP_TEMPLATE is defined outside this file. */
#ifdef WIN32
#define OWS_PDF_TMP_IN_TEMPLATE ".\\ows_pdf_in_XXXXXX"
#define OWS_PDF_TMP_OUT_TEMPLATE ".\\ows_pdf_out_XXXXXX"
#else
#define OWS_PDF_TMP_IN_TEMPLATE OWS_TMP_TEMPLATE("ows_pdf_in_XXXXXX")
#define OWS_PDF_TMP_OUT_TEMPLATE OWS_TMP_TEMPLATE("ows_pdf_out_XXXXXX")
#endif
/* Root directories for stored PDF binaries: persist_pdf_asset() tries the
 * primary root first and switches to the fallback when the shard directory
 * cannot be created under it. */
#define PDF_ASSET_ROOT_PRIMARY "/var/lib/openwebspider/assets/pdf"
#define PDF_ASSET_ROOT_FALLBACK OWS_TMP_TEMPLATE("openwebspider/assets/pdf")

/* Document metadata filled in by parse_pdfinfo() from the line-oriented
 * output of the pdfinfo command. parse_pdfinfo() zeroes the whole struct
 * first, so a field stays empty / 0 when its line is absent. */
typedef struct PDFMETA_TAG
{
    char title[MAXDESCRIPTIONSIZE]; /* text after "Title:" */
    char author[256];               /* text after "Author:" */
    char subject[256];              /* text after "Subject:" */
    char created[128];              /* raw text after "CreationDate:" */
    char modified[128];             /* raw text after "ModDate:" */
    int pages;                      /* number after "Pages:", clamped to >= 0 */
} PDFMETA;

typedef enum
{
    PDF_KIND_NOT_PDF = 0,
    PDF_KIND_DIRECT = 1,
    PDF_KIND_EMBEDDED = 2
} PDF_KIND;

/* Result of extract_pdf_from_buffer(): where the PDF sits inside the
 * scanned buffer. Nothing is copied; pdfPtr points into that buffer. */
typedef struct PDF_EXTRACT_TAG
{
    PDF_KIND kind;               /* how (or whether) a PDF was found */
    const unsigned char *pdfPtr; /* first byte of the PDF, NULL when none */
    long pdfLen;                 /* number of PDF bytes (endOff - startOff) */
    long startOff;               /* offset of "%PDF-" in the buffer, -1 when none */
    long endOff;                 /* offset one past the PDF's last byte, -1 when none */
} PDF_EXTRACT;

/* Copies str into out, dropping every character that occurs in Tokens.
 *
 * str     NUL-terminated input.
 * Tokens  NUL-terminated set of characters to remove.
 * out     destination; must have room for min(len, strlen(str)) + 1 bytes.
 * len     maximum number of input characters to examine; values <= 0
 *         produce an empty string.
 *
 * Returns 1 on success. Returns 0 when any pointer is NULL (out, when
 * non-NULL, is then set to the empty string). */
int UnToken(char* str,char* Tokens,char* out,int len)
{
    size_t limit;
    size_t srcLen;
    size_t i;
    size_t written = 0;

    if (!out)
        return 0;
    if (!str || !Tokens)
    {
        out[0] = '\0';
        return 0;
    }

    srcLen = strlen(str);
    limit = (len > 0) ? (size_t)len : 0;
    if (limit > srcLen)
        limit = srcLen;

    for (i = 0; i < limit; i++)
    {
        /* str[i] is never NUL here, so strchr() cannot match the terminator */
        if (strchr(Tokens, str[i]) == NULL)
            out[written++] = str[i];
    }
    out[written] = '\0';
    return 1;
}

/* Trims s in place: removes leading and trailing whitespace, then turns
 * every remaining control character below 32 (other than '\n' and '\t')
 * into a space. A NULL pointer is ignored. */
static void trim_inplace(char *s)
{
    char *head;
    char *tail;
    char *cur;

    if (s == NULL)
        return;

    /* drop leading whitespace by shifting the rest of the string down */
    head = s;
    while (*head != '\0' && isspace((unsigned char)*head))
        head++;
    if (head != s)
        memmove(s, head, strlen(head) + 1);

    /* cut trailing whitespace */
    tail = s + strlen(s);
    while (tail > s && isspace((unsigned char)tail[-1]))
        *--tail = '\0';

    /* neutralise the control characters that are left */
    for (cur = s; *cur != '\0'; cur++)
    {
        if (*cur == '\n' || *cur == '\t')
            continue;
        if ((unsigned char)*cur < 32)
            *cur = ' ';
    }
}

/* Returns 1 when s begins with prefix (an empty prefix always matches),
 * 0 otherwise or when either pointer is NULL. */
static int starts_with(const char *s, const char *prefix)
{
    if (s == NULL || prefix == NULL)
        return 0;

    /* a NUL in s differs from any remaining prefix character, so the walk
     * never reads past the end of s */
    while (*prefix != '\0')
    {
        if (*s != *prefix)
            return 0;
        s++;
        prefix++;
    }
    return 1;
}

/* Returns 1 when the path part of s (everything before the first '?' or
 * '#') ends in ".pdf", compared case-insensitively; 0 otherwise or when s
 * is NULL.
 *
 * The check is done in place on s. Copying into a fixed-size buffer would
 * truncate long paths and test the suffix at the wrong position. */
static int has_pdf_suffix_ci(const char *s)
{
    static const char suffix[] = ".pdf";
    const size_t suffixLen = sizeof(suffix) - 1;
    const char *tail;
    size_t n;
    size_t i;

    if (!s) return 0;

    n = strcspn(s, "?#");       /* length of the path without query/fragment */
    if (n < suffixLen) return 0;

    tail = s + n - suffixLen;
    for (i = 0; i < suffixLen; i++)
    {
        if (tolower((unsigned char)tail[i]) != suffix[i])
            return 0;
    }
    return 1;
}

/* Case-insensitive substring search. Returns 1 when needle occurs in hay,
 * 0 otherwise; NULL arguments and an empty needle never match. */
static int str_contains_ci(const char *hay, const char *needle)
{
    size_t hayLen;
    size_t needleLen;
    size_t start;

    if (hay == NULL || needle == NULL)
        return 0;

    hayLen = strlen(hay);
    needleLen = strlen(needle);
    if (needleLen == 0)
        return 0;
    if (needleLen > hayLen)
        return 0;

    /* try every offset at which the needle still fits */
    for (start = 0; start + needleLen <= hayLen; start++)
    {
        size_t k = 0;

        while (k < needleLen &&
               tolower((unsigned char)hay[start + k]) == tolower((unsigned char)needle[k]))
            k++;

        if (k == needleLen)
            return 1;
    }
    return 0;
}

/* Binary-safe forward search: returns the offset of the first occurrence of
 * the nlen bytes at needle inside buf[0..len), looking only at offsets
 * >= start. Returns -1 when there is no match or an argument is unusable
 * (NULL pointer, non-positive length, negative start). */
static long find_bytes(const unsigned char *buf, long len, const char *needle, int nlen, long start)
{
    long last;
    long at;

    if (buf == NULL || needle == NULL)
        return -1;
    if (len <= 0 || nlen <= 0 || start < 0)
        return -1;

    last = len - nlen;          /* highest offset at which needle still fits */
    at = start;
    while (at <= last)
    {
        if (memcmp(&buf[at], needle, (size_t)nlen) == 0)
            return at;
        at++;
    }
    return -1;
}

/* Locates a PDF document inside buf[0..len).
 *
 * - Buffer starts with "%PDF-": kind is PDF_KIND_DIRECT and the PDF runs to
 *   the end of the last "%%EOF" marker, or to the end of the buffer when
 *   there is no marker.
 * - "%PDF-" found later: kind is PDF_KIND_EMBEDDED, but only when a
 *   "%%EOF" marker follows and the span is at least 1024 bytes long.
 * - Otherwise kind stays PDF_KIND_NOT_PDF with both offsets at -1.
 *
 * Buffers shorter than 8 bytes are never treated as PDF. The returned
 * pdfPtr points into buf; nothing is copied. */
static PDF_EXTRACT extract_pdf_from_buffer(const unsigned char *buf, long len)
{
    PDF_EXTRACT result;
    long headerAt;
    long scanFrom;
    long hit;
    long finalEof = -1;
    long stop;
    int isDirect;

    memset(&result, 0, sizeof(result));
    result.kind = PDF_KIND_NOT_PDF;
    result.startOff = -1;
    result.endOff = -1;

    if (buf == NULL || len < 8)
        return result;

    isDirect = (memcmp(buf, "%PDF-", 5) == 0);
    headerAt = isDirect ? 0 : find_bytes(buf, len, "%PDF-", 5, 0);
    if (headerAt < 0)
        return result;

    /* remember the last end-of-file marker at or after the header */
    for (scanFrom = headerAt;
         (hit = find_bytes(buf, len, "%%EOF", 5, scanFrom)) >= 0;
         scanFrom = hit + 5)
    {
        finalEof = hit;
    }

    if (isDirect)
    {
        stop = (finalEof > 0) ? finalEof + 5 : len;

        result.kind = PDF_KIND_DIRECT;
        result.pdfPtr = buf;
        result.startOff = 0;
        result.endOff = stop;
        result.pdfLen = stop;
        return result;
    }

    /* an embedded PDF needs a terminating marker and a plausible size */
    if (finalEof < 0)
        return result;
    stop = finalEof + 5;
    if (stop - headerAt < 1024)
        return result;

    result.kind = PDF_KIND_EMBEDDED;
    result.pdfPtr = buf + headerAt;
    result.startOff = headerAt;
    result.endOff = stop;
    result.pdfLen = stop - headerAt;
    return result;
}

static int pdf_carrier_score(const struct sHost *host, const PDF_EXTRACT *extract)
{
    int score = 0;
    const char *ct;
    const char *cd;
    const char *page;

    if (!host) return 0;
    ct = host->HttpContentType;
    cd = host->HttpContentDisposition;
    page = host->Page;

    if (ct && str_contains_ci(ct, "application/pdf")) score += 80;
    if (cd && str_contains_ci(cd, "filename=") && str_contains_ci(cd, ".pdf")) score += 30;
    if (page && has_pdf_suffix_ci(page)) score += 20;
    if (extract && extract->kind == PDF_KIND_DIRECT) score += 80;
    if (extract && extract->kind == PDF_KIND_EMBEDDED) score += 20;
    return score;
}

/* Creates path and all missing parent directories (mode 0755, '/' as the
 * separator), like "mkdir -p".
 *
 * Returns 1 when the directory chain exists afterwards, 0 on failure.
 * Fails for a NULL or empty path and for paths that do not fit the
 * internal 1024-byte buffer. Such paths are rejected instead of being
 * truncated, so a shortened, wrong directory is never created. */
static int ensure_dir_recursive(const char *path)
{
    char tmp[1024];
    char *p;
    size_t len;

    if (!path || path[0] == '\0')
        return 0;

    len = strlen(path);
    if (len >= sizeof(tmp))
        return 0;
    memcpy(tmp, path, len + 1);

    /* drop trailing separators, but keep a lone "/" intact */
    while (len > 1 && tmp[len - 1] == '/')
        tmp[--len] = '\0';

    /* the filesystem root always exists */
    if (len == 1 && tmp[0] == '/')
        return 1;

    /* create each intermediate component by cutting the path temporarily */
    for (p = tmp + 1; *p; p++)
    {
        if (*p == '/')
        {
            *p = '\0';
            if (mkdir(tmp, 0755) != 0 && errno != EEXIST)
                return 0;
            *p = '/';
        }
    }
    if (mkdir(tmp, 0755) != 0 && errno != EEXIST)
        return 0;
    return 1;
}

static int file_exists_nonempty(const char *path)
{
    struct stat st;
    if (!path) return 0;
    if (stat(path, &st) != 0)
        return 0;
    return (S_ISREG(st.st_mode) && st.st_size > 0);
}

/* Writes s to f as the body of a JSON string literal (without the
 * surrounding quotes): quotes, backslashes and control characters are
 * escaped. A NULL s writes nothing. */
static void sidecar_put_json_string(FILE *f, const char *s)
{
    const unsigned char *p;

    if (!s)
        return;

    for (p = (const unsigned char*)s; *p != '\0'; p++)
    {
        switch (*p)
        {
        case '"':  fputs("\\\"", f); break;
        case '\\': fputs("\\\\", f); break;
        case '\n': fputs("\\n", f);  break;
        case '\r': fputs("\\r", f);  break;
        case '\t': fputs("\\t", f);  break;
        default:
            if (*p < 0x20)
                fprintf(f, "\\u%04x", (unsigned int)*p);
            else
                fputc(*p, f);
            break;
        }
    }
}

/* Writes the JSON sidecar that accompanies a stored PDF binary.
 *
 * jsonPath      file to create or overwrite.
 * sourceUrl     value of "source_url" (NULL is written as "").
 * etag          value of "etag" (NULL is written as "").
 * lastModified  value of "last_modified" (NULL is written as "").
 * binSize       value of "size".
 *
 * "saved_at" is the current UTC time, or "" when it cannot be determined.
 * String values are JSON-escaped; HTTP ETags in particular usually contain
 * double quotes. Returns 1 when the file was written and closed without
 * error, 0 otherwise. */
static int write_sidecar_json(const char *jsonPath,
                              const char *sourceUrl,
                              const char *etag,
                              const char *lastModified,
                              size_t binSize)
{
    FILE *jf;
    time_t now;
    struct tm *tmv;
    char tbuf[64];
    int failed;

    if (!jsonPath)
        return 0;

    jf = fopen(jsonPath, "wb");
    if (!jf)
        return 0;

    tbuf[0] = '\0';
    now = time(NULL);
    tmv = gmtime(&now);
    if (tmv && strftime(tbuf, sizeof(tbuf), "%Y-%m-%dT%H:%M:%SZ", tmv) == 0)
        tbuf[0] = '\0';

    fprintf(jf,
            "{\n"
            "  \"mime\": \"application/pdf\",\n"
            "  \"size\": %lu,\n"
            "  \"saved_at\": \"%s\",\n"
            "  \"source_url\": \"",
            (unsigned long)binSize,
            tbuf);
    sidecar_put_json_string(jf, sourceUrl);
    fputs("\",\n  \"etag\": \"", jf);
    sidecar_put_json_string(jf, etag);
    fputs("\",\n  \"last_modified\": \"", jf);
    sidecar_put_json_string(jf, lastModified);
    fputs("\"\n}\n", jf);

    /* fclose() flushes, so its result is part of the write outcome */
    failed = ferror(jf);
    if (fclose(jf) != 0)
        failed = 1;
    return failed ? 0 : 1;
}

/* Stores a PDF binary in the content-addressed asset tree
 * <root>/<h0h1>/<h2h3>/<hash>.pdf and writes a <hash>.json sidecar next to
 * it when none exists yet.
 *
 * binData, binLen  bytes to store (must be non-empty).
 * binHash          hex digest used as file name; at least 4 characters.
 * sourceUrl, etag, lastModified  recorded in the sidecar (may be NULL).
 * outPath          receives the final .pdf path on success and is set to
 *                  the empty string on failure, so a caller that ignores
 *                  the return value never records a path that was not
 *                  written.
 *
 * The primary asset root is tried first, then the fallback root. An
 * existing non-empty asset is left untouched. New data goes to a
 * per-process temporary file that is renamed into place, so readers never
 * see a partial file. Returns 1 on success, 0 on failure. */
static int persist_pdf_asset(const unsigned char *binData,
                             size_t binLen,
                             const char binHash[PDF_HASH_HEX_LEN],
                             const char *sourceUrl,
                             const char *etag,
                             const char *lastModified,
                             char outPath[1024])
{
    const char *root = PDF_ASSET_ROOT_PRIMARY;
    char shardDir[1024];
    char pdfPath[1024];
    char tmpPath[1100];
    char jsonPath[1024];
    FILE *pf;
    int n;

    if (!outPath)
        return 0;
    outPath[0] = '\0';

    if (!binData || binLen == 0 || !binHash)
        return 0;
    if (strlen(binHash) < 4)
        return 0;

    snprintf(shardDir, sizeof(shardDir), "%s/%c%c/%c%c", root, binHash[0], binHash[1], binHash[2], binHash[3]);
    if (!ensure_dir_recursive(shardDir))
    {
        root = PDF_ASSET_ROOT_FALLBACK;
        snprintf(shardDir, sizeof(shardDir), "%s/%c%c/%c%c", root, binHash[0], binHash[1], binHash[2], binHash[3]);
        if (!ensure_dir_recursive(shardDir))
            return 0;
    }

    /* refuse to continue with a truncated (and therefore wrong) file name */
    n = snprintf(pdfPath, sizeof(pdfPath), "%s/%s.pdf", shardDir, binHash);
    if (n < 0 || (size_t)n >= sizeof(pdfPath))
        return 0;
    n = snprintf(jsonPath, sizeof(jsonPath), "%s/%s.json", shardDir, binHash);
    if (n < 0 || (size_t)n >= sizeof(jsonPath))
        return 0;

    if (!file_exists_nonempty(pdfPath))
    {
        snprintf(tmpPath, sizeof(tmpPath), "%s.tmp.%ld", pdfPath, (long)getpid());
        pf = fopen(tmpPath, "wb");
        if (!pf)
            return 0;
        if (fwrite(binData, 1, binLen, pf) != binLen)
        {
            fclose(pf);
            unlink(tmpPath);
            return 0;
        }
        /* buffered data is flushed here; a failure means a short file */
        if (fclose(pf) != 0)
        {
            unlink(tmpPath);
            return 0;
        }
        if (rename(tmpPath, pdfPath) != 0)
        {
            unlink(tmpPath);
            /* acceptable only if the asset appeared in the meantime */
            if (!file_exists_nonempty(pdfPath))
                return 0;
        }
    }

    /* the sidecar is best effort; the asset itself is already in place */
    if (!file_exists_nonempty(jsonPath))
        write_sidecar_json(jsonPath, sourceUrl, etag, lastModified, binLen);

    memcpy(outPath, pdfPath, strlen(pdfPath) + 1);
    return 1;
}

/* Reads at most maxlen - 1 bytes from the start of the file at path into
 * buf and NUL-terminates the result. Returns the number of bytes read, or
 * 0 when an argument is unusable or the file cannot be opened (buf is left
 * untouched in those cases). */
static int read_file_into_buffer(const char *path, char *buf, int maxlen)
{
    FILE *in;
    size_t room;
    size_t got;

    if (path == NULL || buf == NULL)
        return 0;
    if (maxlen <= 1)
        return 0;

    in = fopen(path, "rb");
    if (in == NULL)
        return 0;

    room = (size_t)maxlen - 1;  /* keep one byte for the terminator */
    got = fread(buf, sizeof(char), room, in);
    fclose(in);

    buf[got] = '\0';
    return (int)got;
}

/* Copies in to out (at most outMax - 1 bytes plus NUL) with every run of
 * spaces, tabs, CR and LF reduced to a single space, other control
 * characters below 32 removed, and no leading or trailing space.
 * Does nothing when in or out is NULL or outMax <= 1. */
static void collapse_spaces(const char *in, char *out, int outMax)
{
    const unsigned char *src;
    int used = 0;
    int lastWasSpace = 1;   /* starts set so leading whitespace is dropped */

    if (in == NULL || out == NULL || outMax <= 1)
        return;

    for (src = (const unsigned char*)in; *src != '\0' && used < outMax - 1; src++)
    {
        switch (*src)
        {
        case ' ':
        case '\t':
        case '\r':
        case '\n':
            if (!lastWasSpace)
            {
                out[used++] = ' ';
                lastWasSpace = 1;
            }
            break;

        default:
            if (*src >= 32)
            {
                out[used++] = (char)*src;
                lastWasSpace = 0;
            }
            break;
        }
    }

    /* a run that ended the input leaves one space behind; remove it */
    if (used > 0 && out[used - 1] == ' ')
        used--;

    out[used] = '\0';
}

/* Derives a human-readable title from the last path component of page:
 * '_' and '-' become spaces, the text is lower-cased, a trailing ".pdf" is
 * removed, the result is trimmed and its first character upper-cased.
 *
 * Only the part of page before the first '?' or '#' is considered, so a
 * '/' inside the query string or fragment cannot be mistaken for a path
 * separator. out receives at most outMax - 1 bytes plus NUL. Does nothing
 * when page or out is NULL or outMax <= 1. */
static void clean_title_from_path(const char *page, char *out, int outMax)
{
    const char *leaf;
    const char *pathEnd;
    size_t pathLen;
    size_t k;
    int used = 0;

    if (!page || !out || outMax <= 1)
        return;

    out[0] = '\0';

    pathLen = strcspn(page, "?#");
    pathEnd = page + pathLen;

    /* last component = text after the final '/' within the path part */
    leaf = page;
    for (k = 0; k < pathLen; k++)
    {
        if (page[k] == '/')
            leaf = page + k + 1;
    }

    while (leaf < pathEnd && used < outMax - 1)
    {
        char c = *leaf++;
        if (c == '_' || c == '-')
            c = ' ';
        out[used++] = (char)tolower((unsigned char)c);
    }
    out[used] = '\0';

    if (used > 4 && strcmp(out + used - 4, ".pdf") == 0)
        out[used - 4] = '\0';

    trim_inplace(out);
    if (out[0] != '\0')
        out[0] = (char)toupper((unsigned char)out[0]);
}

/* Builds a short excerpt of `in` and, optionally, a candidate first line.
 *
 * in            NUL-terminated source text.
 * out           receives the excerpt: at most outmax - 1 bytes, at most
 *               maxLines lines, with whitespace at the start of each line
 *               and every '\r' dropped; trimmed with trim_inplace() at the
 *               end.
 * firstLine     optional (may be NULL); receives the text collected before
 *               the first line break that leaves more than 3 characters
 *               after trimming. Shorter lines do not end the collection, so
 *               the text of following lines is appended to them.
 * firstLineMax  capacity of firstLine.
 *
 * Does nothing when in or out is NULL or outmax <= 1. */
static void build_snippet(const char *in, char *out, int outmax, int maxLines, char *firstLine, int firstLineMax)
{
    int i, j = 0, lines = 0;
    int lineStart = 1;
    int firstDone = 0;

    if (!in || !out || outmax <= 1) return;
    out[0] = '\0';
    if (firstLine && firstLineMax > 0) firstLine[0] = '\0';

    for (i = 0; in[i] != '\0' && j < outmax - 1; i++)
    {
        unsigned char c = (unsigned char)in[i];

        /* skip blank lines and indentation: whitespace at a line start is
         * neither copied nor counted */
        if (lineStart && (c == '\r' || c == '\n' || c == ' ' || c == '\t'))
            continue;

        if (!firstDone && firstLine && firstLineMax > 1)
        {
            /* append the character to firstLine while there is room */
            int flen = (int)strlen(firstLine);
            if (c != '\r' && c != '\n' && flen < firstLineMax - 1)
            {
                firstLine[flen] = (char)c;
                firstLine[flen + 1] = '\0';
            }
            /* at a line break, accept the collected text only when it is
             * longer than 3 characters; otherwise keep collecting */
            if (c == '\r' || c == '\n')
            {
                trim_inplace(firstLine);
                if (firstLine[0] != '\0' && strlen(firstLine) > 3)
                    firstDone = 1;
            }
        }

        /* carriage returns never reach the excerpt */
        if (c == '\r')
            continue;

        out[j++] = (char)c;

        if (c == '\n')
        {
            lines++;
            lineStart = 1;
            if (lines >= maxLines)
                break;
        }
        else
        {
            lineStart = 0;
        }
    }

    out[j] = '\0';
    trim_inplace(out);
    /* input ended before a qualifying line break: tidy what was collected */
    if (firstLine && !firstDone)
        trim_inplace(firstLine);
}

/* Builds a one-line description of at most 300 bytes (and at most
 * descrMax - 1 bytes) from snippet, falling back to firstLine when snippet
 * is NULL, empty, or collapses to nothing.
 *
 * Whitespace runs are collapsed with collapse_spaces(). When the text has
 * to be shortened, the cut is moved back to the start of a UTF-8 sequence
 * so that no multi-byte character is split; a split character would make
 * the string invalid for a utf8mb4 column.
 *
 * descrOut is always NUL-terminated when it is non-NULL and descrMax > 1. */
static void build_pdf_description(const char *snippet, const char *firstLine, char *descrOut, int descrMax)
{
    char tmp[4096];
    const char *src;
    size_t limit;
    size_t cut;
    int back;

    if (!descrOut || descrMax <= 1)
        return;

    descrOut[0] = '\0';
    src = (snippet && snippet[0] != '\0') ? snippet : firstLine;
    if (!src)
        return;

    tmp[0] = '\0';
    collapse_spaces(src, tmp, sizeof(tmp));
    if (tmp[0] == '\0' && firstLine)
        collapse_spaces(firstLine, tmp, sizeof(tmp));

    limit = 300;
    if ((size_t)(descrMax - 1) < limit)
        limit = (size_t)(descrMax - 1);

    if (strlen(tmp) > limit)
    {
        /* a UTF-8 sequence has at most 3 continuation bytes (10xxxxxx);
         * step back over them to reach the lead byte */
        cut = limit;
        for (back = 0; back < 3 && cut > 0 && ((unsigned char)tmp[cut] & 0xC0) == 0x80; back++)
            cut--;
        /* still on a continuation byte: the text is not UTF-8, so keep the
         * plain byte limit */
        if (((unsigned char)tmp[cut] & 0xC0) == 0x80)
            cut = limit;
        tmp[cut] = '\0';
    }

    snprintf(descrOut, (size_t)descrMax, "%s", tmp);
}

/* Runs the configured pdfinfo command on pdfPath and fills meta from its
 * "Key: value" output lines (Title, Author, Subject, CreationDate, ModDate,
 * Pages).
 *
 * meta is zeroed first, so it holds empty strings and 0 pages when
 * pdfinfoPath is empty, pdfPath is NULL, the command line does not fit the
 * buffer, or the command cannot be started. Does nothing when meta is NULL.
 *
 * stderr of the child is discarded; the null device is "NUL" on WIN32 and
 * "/dev/null" elsewhere. */
static void parse_pdfinfo(const char *pdfPath, PDFMETA *meta)
{
#ifdef WIN32
    static const char nullDevice[] = "NUL";
#else
    static const char nullDevice[] = "/dev/null";
#endif
    FILE *pp;
    char cmd[1400];
    char line[1200];
    int n;

    if (!meta)
        return;

    memset(meta, 0, sizeof(PDFMETA));
    meta->pages = 0;

    if (!pdfinfoPath[0] || !pdfPath)
        return;

    /* never run a truncated command line */
    n = snprintf(cmd, sizeof(cmd), "%s \"%s\" 2>%s", pdfinfoPath, pdfPath, nullDevice);
    if (n < 0 || (size_t)n >= sizeof(cmd))
        return;

    pp = popen(cmd, "r");
    if (!pp)
        return;

    while (fgets(line, sizeof(line), pp))
    {
        trim_inplace(line);
        if (starts_with(line, "Title:"))
        {
            snprintf(meta->title, sizeof(meta->title), "%s", line + 6);
            trim_inplace(meta->title);
        }
        else if (starts_with(line, "Author:"))
        {
            snprintf(meta->author, sizeof(meta->author), "%s", line + 7);
            trim_inplace(meta->author);
        }
        else if (starts_with(line, "Subject:"))
        {
            snprintf(meta->subject, sizeof(meta->subject), "%s", line + 8);
            trim_inplace(meta->subject);
        }
        else if (starts_with(line, "CreationDate:"))
        {
            snprintf(meta->created, sizeof(meta->created), "%s", line + 13);
            trim_inplace(meta->created);
        }
        else if (starts_with(line, "ModDate:"))
        {
            snprintf(meta->modified, sizeof(meta->modified), "%s", line + 8);
            trim_inplace(meta->modified);
        }
        else if (starts_with(line, "Pages:"))
        {
            meta->pages = atoi(line + 6);
            if (meta->pages < 0)
                meta->pages = 0;
        }
    }

    pclose(pp);
}

/* Computes the SHA-256 digest of data[0..len) and stores it in hexOut as
 * 64 lower-case hex digits followed by NUL. Does nothing when hexOut is
 * NULL. */
static void sha256_hex(const unsigned char *data, size_t len, char hexOut[PDF_HASH_HEX_LEN])
{
    static const char digits[] = "0123456789abcdef";
    unsigned char digest[SHA256_DIGEST_LENGTH];
    char *w;
    size_t k;

    if (hexOut == NULL)
        return;

    SHA256(data, len, digest);

    /* two hex digits per digest byte, high nibble first */
    w = hexOut;
    for (k = 0; k < SHA256_DIGEST_LENGTH; k++)
    {
        *w++ = digits[digest[k] >> 4];
        *w++ = digits[digest[k] & 0x0F];
    }
    hexOut[64] = '\0';
}

/* Produces the canonical form of a text that is fed to the content hash:
 * whitespace runs (space, tab, CR, LF) become one space, other control
 * characters below 32 are dropped, ASCII letters are lower-cased, bytes
 * >= 128 are copied unchanged, and there is no leading or trailing space.
 * out receives at most outMax - 1 bytes plus NUL. Does nothing when in or
 * out is NULL or outMax <= 1. */
static void normalize_text_for_hash(const char *in, char *out, int outMax)
{
    const unsigned char *src;
    int used = 0;
    int lastWasSpace = 1;   /* starts set so leading whitespace is dropped */

    if (in == NULL || out == NULL || outMax <= 1)
        return;

    for (src = (const unsigned char*)in; *src != '\0' && used < outMax - 1; src++)
    {
        const unsigned char ch = *src;

        switch (ch)
        {
        case ' ':
        case '\t':
        case '\r':
        case '\n':
            if (!lastWasSpace)
            {
                out[used++] = ' ';
                lastWasSpace = 1;
            }
            continue;
        default:
            break;
        }

        if (ch < 32)
            continue;

        out[used++] = (ch < 128) ? (char)tolower(ch) : (char)ch;
        lastWasSpace = 0;
    }

    if (used > 0 && out[used - 1] == ' ')
        used--;

    out[used] = '\0';
}

/* Returns a newly allocated copy of src escaped with
 * mysql_real_escape_string() for use inside a quoted SQL literal. A NULL
 * src is treated as the empty string. Returns NULL when db is NULL or the
 * allocation fails. The caller owns the result and must free() it. */
static char *mysql_escape_dyn(MYSQL *db, const char *src)
{
    const char *text = src ? src : "";
    size_t textLen;
    char *escaped;

    if (db == NULL)
        return NULL;

    /* buffer sized at two output bytes per input byte, plus the NUL */
    textLen = strlen(text);
    escaped = (char*)malloc(textLen * 2 + 1);
    if (escaped != NULL)
        mysql_real_escape_string(db, escaped, text, textLen);

    return escaped;
}

/* Runs sql and stores the first column of the first result row in *value,
 * converted with atol(). Returns 1 when such a non-NULL value was found.
 * Returns 0 (leaving *value untouched) on a NULL argument, query failure,
 * missing result set, empty result, or SQL NULL. */
static int query_get_int(MYSQL *db, const char *sql, long *value)
{
    MYSQL_RES *result;
    MYSQL_ROW firstRow;
    int found = 0;

    if (db == NULL || sql == NULL || value == NULL)
        return 0;
    if (mysql_query(db, sql) != 0)
        return 0;

    result = mysql_store_result(db);
    if (result == NULL)
        return 0;

    firstRow = mysql_fetch_row(result);
    if (firstRow != NULL && firstRow[0] != NULL)
    {
        *value = atol(firstRow[0]);
        found = 1;
    }

    mysql_free_result(result);
    return found;
}

/* Makes sure the PDF tables (se_pdf_doc, se_pdf_map, se_pdf_version) exist.
 *
 * The three CREATE TABLE statements must succeed. The ALTER TABLE
 * statements that add later columns to an older se_pdf_doc are best
 * effort and their result is ignored. After the first success the global
 * gPdfSchemaReady flag short-circuits further calls.
 *
 * The column name `fulltext` is a reserved word in MySQL/MariaDB and is
 * therefore backtick-quoted; unquoted it makes the CREATE TABLE statement
 * a syntax error.
 *
 * Returns 1 when the schema is ready, 0 when db is NULL or a CREATE TABLE
 * failed. */
static int ensure_pdf_schema(MYSQL *db)
{
    const char *q1 =
        "CREATE TABLE IF NOT EXISTS se_pdf_doc ("
        "doc_id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,"
        "text_sha256 BINARY(32) NOT NULL,"
        "bin_sha256 BINARY(32) NULL,"
        "bin_path VARCHAR(1024) NULL,"
        "bin_size BIGINT UNSIGNED NULL,"
        "carrier_kind VARCHAR(16) NULL,"
        "http_etag VARCHAR(255) NULL,"
        "http_last_modified VARCHAR(255) NULL,"
        "title VARCHAR(512) NULL,"
        "author VARCHAR(255) NULL,"
        "subject VARCHAR(255) NULL,"
        "lang CHAR(5) NULL,"
        "pages INT NULL,"
        "`fulltext` MEDIUMTEXT NULL,"
        "snippet TEXT NULL,"
        "descr VARCHAR(1024) NULL,"
        "created_raw VARCHAR(128) NULL,"
        "modified_raw VARCHAR(128) NULL,"
        "created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,"
        "updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,"
        "PRIMARY KEY (doc_id),"
        "UNIQUE KEY uq_text_sha256 (text_sha256),"
        "KEY idx_bin_sha256 (bin_sha256),"
        "FULLTEXT KEY ft_pdf_fulltext (`fulltext`)"
        ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci";

    const char *q2 =
        "CREATE TABLE IF NOT EXISTS se_pdf_map ("
        "hostname VARCHAR(100) NOT NULL,"
        "page VARCHAR(255) NOT NULL,"
        "url_id BIGINT UNSIGNED NULL,"
        "doc_id BIGINT UNSIGNED NOT NULL,"
        "first_seen TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,"
        "last_seen TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,"
        "PRIMARY KEY (hostname, page),"
        "UNIQUE KEY uq_url_id (url_id),"
        "KEY idx_doc_id (doc_id)"
        ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci";

    const char *q3 =
        "CREATE TABLE IF NOT EXISTS se_pdf_version ("
        "id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,"
        "hostname VARCHAR(100) NOT NULL,"
        "page VARCHAR(255) NOT NULL,"
        "url_id BIGINT UNSIGNED NULL,"
        "doc_id BIGINT UNSIGNED NOT NULL,"
        "version INT NOT NULL,"
        "change_type ENUM('NEW','BIN_CHANGED','TEXT_CHANGED','META_CHANGED') NOT NULL,"
        "indexed_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,"
        "PRIMARY KEY (id),"
        "UNIQUE KEY uq_url_version (hostname, page, version),"
        "KEY idx_doc_id (doc_id),"
        "KEY idx_url_id (url_id)"
        ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci";

    /* columns added after the first release of se_pdf_doc */
    static const char *const upgrades[] = {
        "ALTER TABLE se_pdf_doc ADD COLUMN IF NOT EXISTS bin_path VARCHAR(1024) NULL",
        "ALTER TABLE se_pdf_doc ADD COLUMN IF NOT EXISTS bin_size BIGINT UNSIGNED NULL",
        "ALTER TABLE se_pdf_doc ADD COLUMN IF NOT EXISTS carrier_kind VARCHAR(16) NULL",
        "ALTER TABLE se_pdf_doc ADD COLUMN IF NOT EXISTS http_etag VARCHAR(255) NULL",
        "ALTER TABLE se_pdf_doc ADD COLUMN IF NOT EXISTS http_last_modified VARCHAR(255) NULL"
    };
    size_t i;

    if (!db)
        return 0;

    if (gPdfSchemaReady == 1)
        return 1;

    if (mysql_query(db, q1) != 0)
        return 0;
    if (mysql_query(db, q2) != 0)
        return 0;
    if (mysql_query(db, q3) != 0)
        return 0;

    /* best effort: servers without ADD COLUMN IF NOT EXISTS reject these */
    for (i = 0; i < sizeof(upgrades) / sizeof(upgrades[0]); i++)
        mysql_query(db, upgrades[i]);

    gPdfSchemaReady = 1;
    return 1;
}

/* Makes sure the attachments_media table exists. The CREATE TABLE must
 * succeed; the ALTER TABLE statements that follow are issued without
 * checking their result. gMediaSchemaReady short-circuits every call after
 * the first success. Returns 1 when the schema is ready, 0 when db is NULL
 * or the CREATE TABLE failed. */
static int ensure_media_schema(MYSQL *db)
{
    static const char createTable[] =
        "CREATE TABLE IF NOT EXISTS attachments_media ("
        "id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,"
        "hostname VARCHAR(100) NOT NULL,"
        "page VARCHAR(255) NOT NULL,"
        "url_id BIGINT UNSIGNED NULL,"
        "media_kind VARCHAR(16) NOT NULL,"
        "mime VARCHAR(128) NULL,"
        "title VARCHAR(512) NULL,"
        "descr VARCHAR(1024) NULL,"
        "descr_source VARCHAR(16) NULL,"
        "ocr_text MEDIUMTEXT NULL,"
        "author VARCHAR(255) NULL,"
        "software VARCHAR(255) NULL,"
        "artist VARCHAR(255) NULL,"
        "album VARCHAR(255) NULL,"
        "video_codec VARCHAR(64) NULL,"
        "audio_codec VARCHAR(64) NULL,"
        "duration_raw VARCHAR(64) NULL,"
        "bit_rate VARCHAR(64) NULL,"
        "width INT NULL,"
        "height INT NULL,"
        "fps VARCHAR(32) NULL,"
        "sample_rate VARCHAR(64) NULL,"
        "channels VARCHAR(32) NULL,"
        "size_bytes BIGINT UNSIGNED NULL,"
        "http_etag VARCHAR(255) NULL,"
        "http_last_modified VARCHAR(255) NULL,"
        "first_seen TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,"
        "last_seen TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,"
        "PRIMARY KEY (id),"
        "UNIQUE KEY uq_media_url (hostname, page),"
        "KEY idx_media_kind (media_kind),"
        "KEY idx_mime (mime),"
        "KEY idx_url_id (url_id)"
        ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci";

    /* column additions, run in this order with their results ignored */
    static const char *const addColumns[] = {
        "ALTER TABLE attachments_media ADD COLUMN IF NOT EXISTS descr_source VARCHAR(16) NULL",
        "ALTER TABLE attachments_media ADD COLUMN IF NOT EXISTS ocr_text MEDIUMTEXT NULL",
        "ALTER TABLE attachments_media ADD COLUMN IF NOT EXISTS author VARCHAR(255) NULL",
        "ALTER TABLE attachments_media ADD COLUMN IF NOT EXISTS software VARCHAR(255) NULL"
    };
    size_t idx;

    if (db == NULL)
        return 0;
    if (gMediaSchemaReady)
        return 1;

    if (mysql_query(db, createTable) != 0)
        return 0;

    for (idx = 0; idx < sizeof(addColumns) / sizeof(addColumns[0]); idx++)
        mysql_query(db, addColumns[idx]);

    gMediaSchemaReady = 1;
    return 1;
}

/* Inserts or refreshes the attachments_media row for the current page,
 * recording it as a PDF with the given title, description, author, size
 * and HTTP validators.
 *
 * arg        crawler context; hostInfo and mysqlDB2 must be set.
 * pdfTitle   title to store (NULL is stored as "").
 * pdfDescr   description to store (NULL is stored as "").
 * meta       optional pdfinfo metadata; only the author is used.
 * pdfBinLen  size of the PDF in bytes.
 *
 * url_id is looked up in pagelist and stored as NULL when no row is found.
 *
 * Returns 1 when the row was written. Returns 0 when the context is
 * incomplete, the schema could not be prepared, a value could not be
 * escaped, the statement does not fit the buffer, or the INSERT failed. */
static int save_pdf_attachment_meta(FUNCTION_ARGUMENT *arg,
                                    const char *pdfTitle,
                                    const char *pdfDescr,
                                    const PDFMETA *meta,
                                    size_t pdfBinLen)
{
    MYSQL *db;
    char sql[8192];
    char urlExpr[32];
    char *eHost, *ePage, *eTitle, *eDescr, *eAuthor, *eEtag, *eLm;
    long urlId = 0;
    int n;
    int ok = 0;

    if (!arg || !arg->hostInfo || !arg->mysqlDB2)
        return 0;
    db = (MYSQL*)arg->mysqlDB2;
    if (!ensure_media_schema(db))
        return 0;

    eHost = mysql_escape_dyn(db, arg->hostInfo->Host);
    ePage = mysql_escape_dyn(db, arg->hostInfo->Page);
    eTitle = mysql_escape_dyn(db, pdfTitle ? pdfTitle : "");
    eDescr = mysql_escape_dyn(db, pdfDescr ? pdfDescr : "");
    eAuthor = mysql_escape_dyn(db, meta ? meta->author : "");
    eEtag = mysql_escape_dyn(db, arg->hostInfo->HttpETag);
    eLm = mysql_escape_dyn(db, arg->hostInfo->HttpLastModified);

    /* an allocation failure must not reach the "%s" conversions below */
    if (!eHost || !ePage || !eTitle || !eDescr || !eAuthor || !eEtag || !eLm)
        goto cleanup;

    n = snprintf(sql, sizeof(sql), "SELECT id FROM pagelist WHERE hostname='%s' AND page='%s' LIMIT 1", eHost, ePage);
    if (n > 0 && (size_t)n < sizeof(sql))
        query_get_int(db, sql, &urlId);
    if (urlId > 0) snprintf(urlExpr, sizeof(urlExpr), "%ld", urlId);
    else strcpy(urlExpr, "NULL");

    n = snprintf(sql, sizeof(sql),
             "INSERT INTO attachments_media(hostname,page,url_id,media_kind,mime,title,descr,author,size_bytes,http_etag,http_last_modified) VALUES("
             "'%s','%s',%s,'pdf','application/pdf','%s','%s','%s',%llu,'%s','%s'"
             ") ON DUPLICATE KEY UPDATE "
             "url_id=VALUES(url_id),media_kind='pdf',mime='application/pdf',title=VALUES(title),descr=VALUES(descr),author=VALUES(author),size_bytes=VALUES(size_bytes),"
             "http_etag=VALUES(http_etag),http_last_modified=VALUES(http_last_modified),last_seen=CURRENT_TIMESTAMP",
             eHost, ePage, urlExpr, eTitle, eDescr, eAuthor, (unsigned long long)pdfBinLen, eEtag, eLm);
    /* a truncated statement is malformed SQL; do not send it */
    if (n < 0 || (size_t)n >= sizeof(sql))
        goto cleanup;

    if (mysql_query(db, sql) == 0)
        ok = 1;

cleanup:
    free(eHost);
    free(ePage);
    free(eTitle);
    free(eDescr);
    free(eAuthor);
    free(eEtag);
    free(eLm);
    return ok;
}

/* Formats an already-escaped value as a quoted SQL literal in dst, or as
 * the keyword NULL when the value is NULL, empty, or too long for dst.
 * Falling back to NULL on overflow guarantees that a literal is never left
 * without its closing quote. cap must be at least 5. */
static void pdf_sql_quoted_or_null(char *dst, size_t cap, const char *escaped)
{
    int n = -1;

    if (escaped && escaped[0] != '\0')
        n = snprintf(dst, cap, "'%s'", escaped);
    if (n < 0 || (size_t)n >= cap)
        snprintf(dst, cap, "NULL");
}

/* Stores one extracted PDF in the database and links it to the current URL.
 *
 * arg          crawler context; hostInfo and mysqlDB2 must be set.
 * fullText     extracted text (NULL is treated as "").
 * snippet      excerpt (NULL is treated as "").
 * pdfTitle     title (NULL is treated as "").
 * pdfDescr     description (NULL is treated as "").
 * meta         optional pdfinfo metadata.
 * pdfBin, pdfBinLen  raw PDF bytes; when present they are hashed and
 *              persisted with persist_pdf_asset().
 * carrierKind  label stored in carrier_kind (NULL or "" is stored as NULL).
 *
 * Steps:
 *  1. The document is keyed by the SHA-256 of the normalised full text
 *     (falling back to the snippet, then to the page path).
 *  2. se_pdf_doc is upserted; on a duplicate key existing non-empty values
 *     are kept, except carrier_kind and the HTTP validators, which are
 *     replaced by non-empty new values.
 *  3. se_pdf_map is upserted for (hostname, page).
 *  4. A se_pdf_version row is added when the URL is new ('NEW') or now
 *     maps to a different document ('TEXT_CHANGED').
 *
 * The se_pdf_doc statement is built in a heap buffer sized for its
 * arguments, because the escaped full text can be far larger than the
 * fixed scratch buffer. The reserved word `fulltext` is backtick-quoted.
 *
 * Returns 1 when the document row could be resolved (doc_id > 0), 0
 * otherwise. */
static int save_pdf_meta(FUNCTION_ARGUMENT *arg,
                         const char *fullText,
                         const char *snippet,
                         const char *pdfTitle,
                         const char *pdfDescr,
                         const PDFMETA *meta,
                         const unsigned char *pdfBin,
                         size_t pdfBinLen,
                         const char *carrierKind)
{
    MYSQL *db;
    char normText[MAXPACKETBUFSIZE];
    char textHash[PDF_HASH_HEX_LEN];
    char binHash[PDF_HASH_HEX_LEN];
    char sql[16384];
    char *insertSql = NULL;
    size_t insertCap;
    int n;
    char *eHost;
    char *ePage;
    char *eTitle;
    char *eDescr;
    char *eSnippet;
    char *eFull;
    char *eBinPath;
    char *eHttpEtag;
    char *eHttpLastMod;
    char *eCarrierKind;
    char *eAuthor;
    char *eSubject;
    char *eCreated;
    char *eModified;
    long docId = 0;
    long oldDocId = 0;
    long oldUrlId = 0;
    long urlId = 0;
    long nextVersion = 1;
    int hasOldMap = 0;
    int pages = 0;
    unsigned long long binSize = 0;
    char assetPath[1024];
    char binExpr[96];
    char pagesExpr[32];
    char binSizeExpr[64];
    char binPathExpr[1200];
    char etagExpr[400];
    char lmExpr[400];
    char carrierExpr[64];
    char urlExpr[32];

    if (!arg || !arg->hostInfo || !arg->mysqlDB2)
        return 0;

    db = (MYSQL*)arg->mysqlDB2;

    if (!fullText)
        fullText = "";
    if (!snippet)
        snippet = "";
    if (!pdfTitle)
        pdfTitle = "";
    if (!pdfDescr)
        pdfDescr = "";

    if (!ensure_pdf_schema(db))
        return 0;

    /* identity of the document: hash of the normalised text, with the
     * snippet and finally the page path as fallbacks for empty text */
    normalize_text_for_hash(fullText, normText, sizeof(normText));
    if (normText[0] == '\0')
        normalize_text_for_hash(snippet, normText, sizeof(normText));
    if (normText[0] == '\0')
        normalize_text_for_hash(arg->hostInfo->Page, normText, sizeof(normText));

    sha256_hex((const unsigned char*)normText, strlen(normText), textHash);

    if (pdfBin && pdfBinLen > 0)
    {
        sha256_hex(pdfBin, pdfBinLen, binHash);
        binSize = (unsigned long long)pdfBinLen;
    }
    else
        binHash[0] = '\0';

    /* only a successfully persisted asset may leave a path behind */
    assetPath[0] = '\0';
    if (binHash[0] != '\0')
    {
        if (!persist_pdf_asset(pdfBin,
                               pdfBinLen,
                               binHash,
                               arg->hostInfo->Page,
                               arg->hostInfo->HttpETag,
                               arg->hostInfo->HttpLastModified,
                               assetPath))
            assetPath[0] = '\0';
    }

    eHost = mysql_escape_dyn(db, arg->hostInfo->Host);
    ePage = mysql_escape_dyn(db, arg->hostInfo->Page);
    eTitle = mysql_escape_dyn(db, pdfTitle);
    eDescr = mysql_escape_dyn(db, pdfDescr);
    eSnippet = mysql_escape_dyn(db, snippet);
    eFull = mysql_escape_dyn(db, fullText);
    eBinPath = mysql_escape_dyn(db, assetPath);
    eHttpEtag = mysql_escape_dyn(db, arg->hostInfo->HttpETag);
    eHttpLastMod = mysql_escape_dyn(db, arg->hostInfo->HttpLastModified);
    eCarrierKind = mysql_escape_dyn(db, carrierKind ? carrierKind : "");
    eAuthor = mysql_escape_dyn(db, meta ? meta->author : "");
    eSubject = mysql_escape_dyn(db, meta ? meta->subject : "");
    eCreated = mysql_escape_dyn(db, meta ? meta->created : "");
    eModified = mysql_escape_dyn(db, meta ? meta->modified : "");

    if (!eHost || !ePage || !eTitle || !eDescr || !eSnippet || !eFull || !eBinPath || !eHttpEtag || !eHttpLastMod || !eCarrierKind || !eAuthor || !eSubject || !eCreated || !eModified)
        goto cleanup;

    pages = (meta && meta->pages > 0) ? meta->pages : 0;

    /* optional values are rendered as SQL expressions: a literal or NULL */
    if (binHash[0] != '\0')
        snprintf(binExpr, sizeof(binExpr), "UNHEX('%s')", binHash);
    else
        strcpy(binExpr, "NULL");

    if (pages > 0)
        snprintf(pagesExpr, sizeof(pagesExpr), "%d", pages);
    else
        strcpy(pagesExpr, "NULL");

    if (binSize > 0)
        snprintf(binSizeExpr, sizeof(binSizeExpr), "%llu", binSize);
    else
        strcpy(binSizeExpr, "NULL");

    pdf_sql_quoted_or_null(binPathExpr, sizeof(binPathExpr), eBinPath);
    pdf_sql_quoted_or_null(etagExpr, sizeof(etagExpr), eHttpEtag);
    pdf_sql_quoted_or_null(lmExpr, sizeof(lmExpr), eHttpLastMod);
    pdf_sql_quoted_or_null(carrierExpr, sizeof(carrierExpr), eCarrierKind);

    /* the fixed part of the statement is well below 4096 bytes; the rest
     * is the sum of the argument lengths */
    insertCap = 4096
              + strlen(textHash) + strlen(binExpr) + strlen(binPathExpr)
              + strlen(binSizeExpr) + strlen(carrierExpr) + strlen(etagExpr)
              + strlen(lmExpr) + strlen(eTitle) + strlen(eAuthor)
              + strlen(eSubject) + strlen(pagesExpr) + strlen(eFull)
              + strlen(eSnippet) + strlen(eDescr) + strlen(eCreated)
              + strlen(eModified);
    insertSql = (char*)malloc(insertCap);
    if (!insertSql)
        goto cleanup;

    n = snprintf(insertSql, insertCap,
             "INSERT INTO se_pdf_doc(text_sha256,bin_sha256,bin_path,bin_size,carrier_kind,http_etag,http_last_modified,title,author,subject,lang,pages,`fulltext`,snippet,descr,created_raw,modified_raw) "
             "VALUES(UNHEX('%s'),%s,%s,%s,%s,%s,%s,'%s','%s','%s',NULL,%s,'%s','%s','%s','%s','%s') "
             "ON DUPLICATE KEY UPDATE "
             "bin_sha256=IF(bin_sha256 IS NULL, VALUES(bin_sha256), bin_sha256),"
             "bin_path=IF(bin_path IS NULL OR bin_path='', VALUES(bin_path), bin_path),"
             "bin_size=IF(bin_size IS NULL OR bin_size=0, VALUES(bin_size), bin_size),"
             "carrier_kind=IF(VALUES(carrier_kind) IS NOT NULL AND VALUES(carrier_kind)!='', VALUES(carrier_kind), carrier_kind),"
             "http_etag=IF(VALUES(http_etag) IS NOT NULL AND VALUES(http_etag)!='', VALUES(http_etag), http_etag),"
             "http_last_modified=IF(VALUES(http_last_modified) IS NOT NULL AND VALUES(http_last_modified)!='', VALUES(http_last_modified), http_last_modified),"
             "title=IF(title IS NULL OR title='', VALUES(title), title),"
             "author=IF(author IS NULL OR author='', VALUES(author), author),"
             "subject=IF(subject IS NULL OR subject='', VALUES(subject), subject),"
             "pages=IF(pages IS NULL OR pages=0, VALUES(pages), pages),"
             "snippet=IF(snippet IS NULL OR snippet='', VALUES(snippet), snippet),"
             "descr=IF(descr IS NULL OR descr='', VALUES(descr), descr),"
             "`fulltext`=IF(`fulltext` IS NULL OR `fulltext`='', VALUES(`fulltext`), `fulltext`),"
             "created_raw=IF(created_raw IS NULL OR created_raw='', VALUES(created_raw), created_raw),"
             "modified_raw=IF(modified_raw IS NULL OR modified_raw='', VALUES(modified_raw), modified_raw)",
             textHash,
             binExpr,
             binPathExpr,
             binSizeExpr,
             carrierExpr,
             etagExpr,
             lmExpr,
             eTitle,
             eAuthor,
             eSubject,
             pagesExpr,
             eFull,
             eSnippet,
             eDescr,
             eCreated,
             eModified);
    if (n < 0 || (size_t)n >= insertCap)
        goto cleanup;

    if (mysql_query(db, insertSql) != 0)
        goto cleanup;

    snprintf(sql, sizeof(sql), "SELECT doc_id FROM se_pdf_doc WHERE text_sha256=UNHEX('%s') LIMIT 1", textHash);
    if (!query_get_int(db, sql, &docId) || docId <= 0)
        goto cleanup;

    /* previous mapping of this URL, if any, decides the version entry */
    snprintf(sql, sizeof(sql), "SELECT doc_id, IFNULL(url_id,0) FROM se_pdf_map WHERE hostname='%s' AND page='%s' LIMIT 1", eHost, ePage);
    if (mysql_query(db, sql) == 0)
    {
        MYSQL_RES *res = mysql_store_result(db);
        if (res)
        {
            MYSQL_ROW row = mysql_fetch_row(res);
            if (row)
            {
                hasOldMap = 1;
                oldDocId = row[0] ? atol(row[0]) : 0;
                oldUrlId = row[1] ? atol(row[1]) : 0;
            }
            mysql_free_result(res);
        }
    }

    snprintf(sql, sizeof(sql), "SELECT id FROM pagelist WHERE hostname='%s' AND page='%s' ORDER BY id DESC LIMIT 1", eHost, ePage);
    if (!query_get_int(db, sql, &urlId))
        urlId = oldUrlId;

    if (urlId > 0)
        snprintf(urlExpr, sizeof(urlExpr), "%ld", urlId);
    else
        strcpy(urlExpr, "NULL");

    snprintf(sql, sizeof(sql),
             "INSERT INTO se_pdf_map(hostname,page,url_id,doc_id,first_seen,last_seen) "
             "VALUES('%s','%s',%s,%ld,NOW(),NOW()) "
             "ON DUPLICATE KEY UPDATE url_id=IFNULL(VALUES(url_id),url_id), doc_id=VALUES(doc_id), last_seen=NOW()",
             eHost, ePage, urlExpr, docId);

    if (mysql_query(db, sql) != 0)
        goto cleanup;

    if (!hasOldMap)
    {
        nextVersion = 1;
        snprintf(sql, sizeof(sql),
                 "INSERT INTO se_pdf_version(hostname,page,url_id,doc_id,version,change_type,indexed_at) "
                 "VALUES('%s','%s',%s,%ld,%ld,'NEW',NOW())",
                 eHost, ePage, urlExpr, docId, nextVersion);
    }
    else if (oldDocId != docId)
    {
        snprintf(sql, sizeof(sql), "SELECT COALESCE(MAX(version),0)+1 FROM se_pdf_version WHERE hostname='%s' AND page='%s'", eHost, ePage);
        if (!query_get_int(db, sql, &nextVersion) || nextVersion < 1)
            nextVersion = 1;

        snprintf(sql, sizeof(sql),
                 "INSERT INTO se_pdf_version(hostname,page,url_id,doc_id,version,change_type,indexed_at) "
                 "VALUES('%s','%s',%s,%ld,%ld,'TEXT_CHANGED',NOW())",
                 eHost, ePage, urlExpr, docId, nextVersion);
    }
    else
    {
        /* same document as before: no new version entry */
        sql[0] = '\0';
    }

    if (sql[0] != '\0')
        mysql_query(db, sql);

cleanup:
    free(insertSql);
    free(eHost);
    free(ePage);
    free(eTitle);
    free(eDescr);
    free(eSnippet);
    free(eFull);
    free(eBinPath);
    free(eHttpEtag);
    free(eHttpLastMod);
    free(eCarrierKind);
    free(eAuthor);
    free(eSubject);
    free(eCreated);
    free(eModified);

    return (docId > 0) ? 1 : 0;
}

/* modFilter should return 1 if the current page must be indexed, or 0 if it must be discarded. */
#ifdef WIN32
extern __declspec(dllexport)
#endif
int modFilter (struct functArg* arg)
{
int inFd, outFd;
FILE* pF;
char command[2000];
char inPdfPath[] = OWS_PDF_TMP_IN_TEMPLATE;
char outTxtPath[] = OWS_PDF_TMP_OUT_TEMPLATE;
char fullText[MAXPACKETBUFSIZE];
char snippet[MAXPACKETBUFSIZE];
char firstLine[MAXDESCRIPTIONSIZE];
char pdfTitle[MAXDESCRIPTIONSIZE];
char pdfDescr[1025];
PDFMETA meta;
char pathTitle[MAXDESCRIPTIONSIZE];
int readLen;
PDF_EXTRACT extract;
const unsigned char *pdfBytes = NULL;
long pdfLen = 0;
int carrierScore = 0;
int shouldTreatAsPdf = 0;
const char *carrierKind = "";

    if(arg)
        if(arg->hostInfo->type == 4)
        {
            if(pdftotextPath[0]==0)
                return 1;

            if (!arg->html || arg->htmlLength <= 0)
                return 1;

            extract = extract_pdf_from_buffer((const unsigned char*)arg->html, (long)arg->htmlLength);
            carrierScore = pdf_carrier_score(arg->hostInfo, &extract);

            if (extract.kind == PDF_KIND_DIRECT)
                shouldTreatAsPdf = 1;
            else if (extract.kind == PDF_KIND_EMBEDDED && carrierScore >= 40)
                shouldTreatAsPdf = 1;
            else if (carrierScore >= 80 && extract.kind != PDF_KIND_NOT_PDF)
                shouldTreatAsPdf = 1;
            else
                shouldTreatAsPdf = 0;

            if (!shouldTreatAsPdf)
                return 1;

            pdfBytes = extract.pdfPtr;
            pdfLen = extract.pdfLen;
            carrierKind = (extract.kind == PDF_KIND_EMBEDDED) ? "EMBEDDED" : "DIRECT";
            if (!pdfBytes || pdfLen <= 0)
                return 1;

            inFd = mkstemp(inPdfPath);
            if(inFd < 0)
                return 0;

            pF = fdopen(inFd,"wb");
            if(!pF)
            {
                close(inFd);
                unlink(inPdfPath);
                return 0;
            }
            fwrite(pdfBytes,1,(size_t)pdfLen,pF);
            fclose(pF);

            outFd = mkstemp(outTxtPath);
            if(outFd < 0)
            {
                unlink(inPdfPath);
                return 0;
            }
            close(outFd);
            unlink(outTxtPath);

            snprintf(command, sizeof(command),
                     "%s -q -enc UTF-8 -nopgbrk \"%s\" \"%s\" 2>/dev/null",
                     pdftotextPath, inPdfPath, outTxtPath);
            system(command);

            memset(fullText,0,sizeof(fullText));
            readLen = read_file_into_buffer(outTxtPath, fullText, sizeof(fullText));

            /* Do not discard PDFs with empty/unextractable text.
               Some files are image-based or protected: keep indexing metadata. */
            if(readLen <= 0)
                fullText[0] = '\0';

            memset(&meta, 0, sizeof(meta));
            parse_pdfinfo(inPdfPath, &meta);

            memset(snippet,0,sizeof(snippet));
            memset(firstLine,0,sizeof(firstLine));
            build_snippet(fullText, snippet, sizeof(snippet), gSnippetLines, firstLine, sizeof(firstLine));

            memset(pdfTitle,0,sizeof(pdfTitle));
            if(meta.title[0] != '\0')
            {
                strncpy(pdfTitle, meta.title, sizeof(pdfTitle)-1);
                pdfTitle[sizeof(pdfTitle)-1] = '\0';
            }
            else if(firstLine[0] != '\0')
            {
                strncpy(pdfTitle, firstLine, sizeof(pdfTitle)-1);
                pdfTitle[sizeof(pdfTitle)-1] = '\0';
            }
            else
            {
                clean_title_from_path(arg->hostInfo->Page, pathTitle, sizeof(pathTitle));
                strncpy(pdfTitle, pathTitle, sizeof(pdfTitle)-1);
                pdfTitle[sizeof(pdfTitle)-1] = '\0';
            }

            memset(pdfDescr, 0, sizeof(pdfDescr));
            build_pdf_description(snippet, firstLine, pdfDescr, sizeof(pdfDescr));

            if (snippet[0] == '\0' && pdfTitle[0] != '\0')
            {
                strncpy(snippet, pdfTitle, sizeof(snippet)-1);
                snippet[sizeof(snippet)-1] = '\0';
            }

            unlink(inPdfPath);
            unlink(outTxtPath);

            if(pdfTitle[0] != '\0')
            {
                strncpy(arg->hostInfo->Description, pdfTitle, MAXDESCRIPTIONSIZE-1);
                arg->hostInfo->Description[MAXDESCRIPTIONSIZE-1] = '\0';
            }
            else if(firstLine[0] != '\0')
            {
                strncpy(arg->hostInfo->Description, firstLine, MAXDESCRIPTIONSIZE-1);
                arg->hostInfo->Description[MAXDESCRIPTIONSIZE-1] = '\0';
            }

            strncpy(arg->text, snippet, MAXPACKETBUFSIZE-1);
            arg->text[MAXPACKETBUFSIZE-1] = '\0';
            arg->textLength = strlen(arg->text);

            save_pdf_meta(arg, fullText, snippet, pdfTitle, pdfDescr, &meta, pdfBytes, (size_t)pdfLen, carrierKind);
            save_pdf_attachment_meta(arg, pdfTitle, pdfDescr, &meta, (size_t)pdfLen);
            return 1;
        }
        else
            return 1;

return 0;
}

#ifdef WIN32
extern __declspec(dllexport)
#endif
int modInitFilter (char* hostname, char* error)
{

FILE* pF;
char sLine[500];
char* eq;

    strncpy(pdftotextPath, OWS_DEFAULT_PDFTOTEXT, sizeof(pdftotextPath)-1);
    pdftotextPath[sizeof(pdftotextPath)-1]=0;
    strncpy(pdfinfoPath, OWS_DEFAULT_PDFINFO, sizeof(pdfinfoPath)-1);
    pdfinfoPath[sizeof(pdfinfoPath)-1]=0;
    gSnippetLines=12;

    pF=ows_fopen_config("mod_pdf.conf","r",NULL,0);
    if(pF==NULL)
    {
        snprintf(error,500,"File not found: mod_pdf.conf or %s/mod_pdf.conf", OWS_SYSCONFDIR);
        return 0;
    }

    while(!feof(pF))
    {
        memset(sLine,0,sizeof(sLine));

        if (!fgets(sLine,499,pF))
            break;

        if(sLine[0]=='#' || sLine[0]=='\r' || sLine[0]=='\n' || sLine[0]==0)
            continue;
        else
        {
            char parsed[500];
            UnToken(sLine,"\r\n",parsed,499);
            trim_inplace(parsed);
            if(parsed[0]==0)
                continue;

            if(starts_with(parsed,"pdftotext="))
            {
                strncpy(pdftotextPath, parsed + 9, sizeof(pdftotextPath)-1);
                pdftotextPath[sizeof(pdftotextPath)-1]=0;
            }
            else if(starts_with(parsed,"pdfinfo="))
            {
                strncpy(pdfinfoPath, parsed + 8, sizeof(pdfinfoPath)-1);
                pdfinfoPath[sizeof(pdfinfoPath)-1]=0;
            }
            else if(starts_with(parsed,"snippet_lines="))
            {
                gSnippetLines = atoi(parsed + 14);
                if(gSnippetLines < 1) gSnippetLines = 1;
                if(gSnippetLines > 200) gSnippetLines = 200;
            }
            else if((eq=strchr(parsed,'='))==NULL && pdftotextPath[0]==0)
            {
                strncpy(pdftotextPath, parsed, sizeof(pdftotextPath)-1);
                pdftotextPath[sizeof(pdftotextPath)-1]=0;
            }
        }
    }

    fclose(pF);

    return 1;
}
