/*
 ** 2015 Jan 30
 **
 ** The author disclaims copyright to this source code.  In place of
 ** a legal notice, here is a blessing:
 **
 **    May you do good and not evil.
 **    May you find forgiveness for yourself and forgive others.
 **    May you share freely, never taking more than you give.
 **
 ******************************************************************************
 **
 ** Implementation based on the "unicode" full-text-search tokenizer.
 */

#include "fts3_html_tokenizer.h"
#include "fts3Int.h"

#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)

#include <assert.h>
#include <stdlib.h>
#include <string.h>

// HTML tag tables consulted by unicodeNext().
//
// Elements of these two arrays are paired: ignore_closing_tags[i] is the
// closing tag name that belongs to ignore_opening_tags[i]. unicodeNext()
// compares the text following a '<' case-insensitively against each opening
// entry and, on a match, looks up the closing entry at the same index.
// The opening entries include the attribute text that must follow the tag
// name (e.g. class="marker"); the closing entries are bare tag names.
char *ignore_opening_tags[] = { "sup class=\"marker\"", "head", "footer class=\"study-notes\"", "footer class=\"notes\"" };
char *ignore_closing_tags[] = { "sup", "head", "footer", "footer" };

// NOTE(review): only the element counts of the next two arrays are taken in
// the part of unicodeNext() visible here; presumably "nonbreaking" tags do
// not split a token and the "nonbreaking ignore" tags are skipped inside
// them -- confirm against the rest of unicodeNext().
char *nonbreaking_tags[] = { "ruby" };
char *nonbreaking_ignore_tags[] = { "rp", "rt" };

/*
 ** Return non-zero if the argument, interpreted as a unicode codepoint,
 ** is a diacritical modifier character, and 0 otherwise.
 **
 ** Only codepoints 768..817 can produce a non-zero result. mask0 holds
 ** one bit per codepoint for 768..799 and mask1 one bit per codepoint for
 ** 800..817; a set bit marks the codepoint as a diacritic. The non-zero
 ** value returned is the selected mask bit, not necessarily 1.
 */
static int sqlite3FtsUnicodeIsdiacritic(int c) {
    unsigned int mask0 = 0x08029FDF;
    unsigned int mask1 = 0x000361F8;
    if( c<768 || c>817 ) return 0;
    /* The shifted constant must be unsigned: c==799 selects bit 31, and
     ** shifting a signed 1 into the sign bit is undefined behaviour. */
    return (c < 768+32) ?
           (int)(mask0 & (1u << (c-768))) :
           (int)(mask1 & (1u << (c-768-32)));
}

/*
 ** Return 1 if the argument corresponds to a unicode codepoint
 ** classified as either a letter or a number. Otherwise return 0.
 **
 ** Codepoints below 128 are answered from the aAscii[] bitmap, codepoints
 ** below (1<<22) by a binary search of aEntry[], and anything larger is
 ** reported as alphanumeric.
 **
 ** The results are undefined if the value passed to this function
 ** is less than zero.
 */
static int sqlite3FtsUnicodeIsalnum(int c) {
    /* Each unsigned integer in the following array corresponds to a contiguous
     ** range of unicode codepoints that are not either letters or numbers (i.e.
     ** codepoints for which this function should return 0).
     **
     ** The most significant 22 bits in each 32-bit value contain the first
     ** codepoint in the range. The least significant 10 bits are used to store
     ** the size of the range (always at least 1). In other words, the value
     ** ((C<<10) + N) represents a range of N codepoints starting with codepoint
     ** C. It is not possible to represent a range larger than 1023 codepoints
     ** using this format.
     */
    static const unsigned int aEntry[] = {
            0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
            0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
            0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
            0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
            0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
            0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
            0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
            0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
            0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
            0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
            0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
            0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
            0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
            0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
            0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
            0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
            0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
            0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
            0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
            0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
            0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
            0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
            0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
            0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
            0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
            0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
            0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
            0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
            0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
            0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
            0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
            0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
            0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
            0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
            0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
            0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
            0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
            0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
            0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
            0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
            0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
            0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
            0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
            0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
            0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
            0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
            0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
            0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
            0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
            0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
            0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
            0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
            0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
            0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
            0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
            0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
            0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
            0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
            0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
            0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
            0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
            0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802,
            0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013,
            0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06,
            0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003,
            0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01,
            0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403,
            0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009,
            0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003,
            0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003,
            0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E,
            0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046,
            0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401,
            0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401,
            0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
            0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C,
            0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002,
            0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025,
            0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6,
            0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46,
            0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060,
            0x380400F0,
    };
    /* One bit per ASCII codepoint (32 per word); a set bit means the
     ** codepoint is NOT a letter or number. */
    static const unsigned int aAscii[4] = {
            0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
    };

    if (c < 128) {
        /* The shifted constant must be unsigned: (c & 0x1F) can be 31, and
         ** shifting a signed 1 into the sign bit is undefined behaviour. */
        return ((aAscii[c >> 5] & (1u << (c & 0x001F))) == 0);
    } else if (c < (1 << 22)) {
        /* Setting the low 10 bits makes key compare >= every entry whose
         ** first codepoint is <= c, regardless of that entry's size field. */
        unsigned int key = (((unsigned int) c) << 10) | 0x000003FF;
        int iRes = 0;
        int iHi = sizeof(aEntry) / sizeof(aEntry[0]) - 1;
        int iLo = 0;
        /* Binary search for the last entry that is <= key. */
        while (iHi >= iLo) {
            int iTest = (iHi + iLo) / 2;
            if (key >= aEntry[iTest]) {
                iRes = iTest;
                iLo = iTest + 1;
            } else {
                iHi = iTest - 1;
            }
        }
        assert(aEntry[0] < key);
        assert(key >= aEntry[iRes]);
        /* c is alphanumeric iff it lies past the end of the range found. */
        return (((unsigned int) c) >= ((aEntry[iRes] >> 10) + (aEntry[iRes] & 0x3FF)));
    }
    return 1;
}

/*
 ** If the argument is a codepoint corresponding to a lowercase letter
 ** in the ASCII range with a diacritic added, return the codepoint
 ** of the ASCII letter only. For example, if passed 235 - "LATIN
 ** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
 ** E"). Codepoints not covered by the table are returned unchanged.
 ** A few table ranges map to '\0'; for codepoints in those ranges the
 ** function returns 0. The results of passing a codepoint that
 ** corresponds to an uppercase letter are undefined.
 */
static int remove_diacritic(int c) {
    /* Each aDia[] entry describes a range of codepoints: the upper 13 bits
     ** hold the first codepoint and the low 3 bits the number of codepoints
     ** that follow it, so entry E covers (E>>3) .. (E>>3)+(E&7) inclusive.
     ** aChar[i] is the replacement for every codepoint in range aDia[i].
     ** Both tables are read-only, so they are static const rather than
     ** being rebuilt on the stack for every call. */
    static const unsigned short aDia[] = {
            0, 1797, 1848, 1859, 1891, 1928, 1940, 1995,
            2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286,
            2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732,
            2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336,
            3456, 3696, 3712, 3728, 3744, 3896, 3912, 3928,
            3968, 4008, 4040, 4106, 4138, 4170, 4202, 4234,
            4266, 4296, 4312, 4344, 4408, 4424, 4472, 4504,
            6148, 6198, 6264, 6280, 6360, 6429, 6505, 6529,
            61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726,
            61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122,
            62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536,
            62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730,
            62924, 63050, 63082, 63274, 63390,
    };
    static const char aChar[] = {
            '\0', 'a', 'c', 'e', 'i', 'n', 'o', 'u', 'y', 'y', 'a', 'c',
            'd', 'e', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'r',
            's', 't', 'u', 'u', 'w', 'y', 'z', 'o', 'u', 'a', 'i', 'o',
            'u', 'g', 'k', 'o', 'j', 'g', 'n', 'a', 'e', 'i', 'o', 'r',
            'u', 's', 't', 'h', 'a', 'e', 'o', 'y', '\0', '\0', '\0', '\0',
            '\0', '\0', '\0', '\0', 'a', 'b', 'd', 'd', 'e', 'f', 'g', 'h',
            'h', 'i', 'k', 'l', 'l', 'm', 'n', 'p', 'r', 'r', 's', 't',
            'u', 'v', 'w', 'w', 'x', 'y', 'z', 'h', 't', 'w', 'y', 'a',
            'e', 'i', 'o', 'u', 'y',
    };

    /* Setting the low 3 bits makes key compare >= every entry whose first
     ** codepoint is <= c, whatever that entry's length field holds. */
    unsigned int key = (((unsigned int) c) << 3) | 0x00000007;
    int iRes = 0;
    int iHi = sizeof(aDia) / sizeof(aDia[0]) - 1;
    int iLo = 0;
    /* Binary search for the last entry that is <= key. */
    while (iHi >= iLo) {
        int iTest = (iHi + iLo) / 2;
        if (key >= aDia[iTest]) {
            iRes = iTest;
            iLo = iTest + 1;
        } else {
            iHi = iTest - 1;
        }
    }
    assert(key >= aDia[iRes]);
    /* c past the end of the range found means "no mapping": return c. */
    return ((c > (aDia[iRes] >> 3) + (aDia[iRes] & 0x07)) ? c : (int) aChar[iRes]);
}

/*
 ** Case-fold a unicode codepoint. If c is an upper case character that
 ** has a lower case equivalent, the lower case codepoint is returned;
 ** otherwise c itself is returned.
 **
 ** For codepoints in the range 128..65535, and only when bRemoveDiacritic
 ** is non-zero, the folded value is additionally passed through
 ** remove_diacritic() before being returned.
 **
 ** The results are undefined if the value passed to this function
 ** is less than zero.
 */
static int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic) {
    /* Folding rules, sorted by iCode. Each rule covers nRange codepoints
     ** beginning at iCode.
     **
     ** Bit 0 of flags selects the stride: when clear every codepoint in the
     ** range is folded, when set only every second codepoint (those with the
     ** same parity as iCode) is folded.
     **
     ** The remaining 7 bits of flags (flags>>1) index aDelta[]. A codepoint C
     ** that needs folding maps to ((C + aDelta[flags>>1]) & 0xFFFF).
     **
     ** The contents of this array are generated by parsing the CaseFolding.txt
     ** file distributed as part of the "Unicode Character Database". See
     ** http://www.unicode.org for details.
     */
    static const struct TableEntry {
        unsigned short iCode;
        unsigned char flags;
        unsigned char nRange;
    } aRule[] = {
            {65, 14, 26}, {181, 64, 1}, {192, 14, 23},
            {216, 14, 7}, {256, 1, 48}, {306, 1, 6},
            {313, 1, 16}, {330, 1, 46}, {376, 116, 1},
            {377, 1, 6}, {383, 104, 1}, {385, 50, 1},
            {386, 1, 4}, {390, 44, 1}, {391, 0, 1},
            {393, 42, 2}, {395, 0, 1}, {398, 32, 1},
            {399, 38, 1}, {400, 40, 1}, {401, 0, 1},
            {403, 42, 1}, {404, 46, 1}, {406, 52, 1},
            {407, 48, 1}, {408, 0, 1}, {412, 52, 1},
            {413, 54, 1}, {415, 56, 1}, {416, 1, 6},
            {422, 60, 1}, {423, 0, 1}, {425, 60, 1},
            {428, 0, 1}, {430, 60, 1}, {431, 0, 1},
            {433, 58, 2}, {435, 1, 4}, {439, 62, 1},
            {440, 0, 1}, {444, 0, 1}, {452, 2, 1},
            {453, 0, 1}, {455, 2, 1}, {456, 0, 1},
            {458, 2, 1}, {459, 1, 18}, {478, 1, 18},
            {497, 2, 1}, {498, 1, 4}, {502, 122, 1},
            {503, 134, 1}, {504, 1, 40}, {544, 110, 1},
            {546, 1, 18}, {570, 70, 1}, {571, 0, 1},
            {573, 108, 1}, {574, 68, 1}, {577, 0, 1},
            {579, 106, 1}, {580, 28, 1}, {581, 30, 1},
            {582, 1, 10}, {837, 36, 1}, {880, 1, 4},
            {886, 0, 1}, {902, 18, 1}, {904, 16, 3},
            {908, 26, 1}, {910, 24, 2}, {913, 14, 17},
            {931, 14, 9}, {962, 0, 1}, {975, 4, 1},
            {976, 140, 1}, {977, 142, 1}, {981, 146, 1},
            {982, 144, 1}, {984, 1, 24}, {1008, 136, 1},
            {1009, 138, 1}, {1012, 130, 1}, {1013, 128, 1},
            {1015, 0, 1}, {1017, 152, 1}, {1018, 0, 1},
            {1021, 110, 3}, {1024, 34, 16}, {1040, 14, 32},
            {1120, 1, 34}, {1162, 1, 54}, {1216, 6, 1},
            {1217, 1, 14}, {1232, 1, 88}, {1329, 22, 38},
            {4256, 66, 38}, {4295, 66, 1}, {4301, 66, 1},
            {7680, 1, 150}, {7835, 132, 1}, {7838, 96, 1},
            {7840, 1, 96}, {7944, 150, 8}, {7960, 150, 6},
            {7976, 150, 8}, {7992, 150, 8}, {8008, 150, 6},
            {8025, 151, 8}, {8040, 150, 8}, {8072, 150, 8},
            {8088, 150, 8}, {8104, 150, 8}, {8120, 150, 2},
            {8122, 126, 2}, {8124, 148, 1}, {8126, 100, 1},
            {8136, 124, 4}, {8140, 148, 1}, {8152, 150, 2},
            {8154, 120, 2}, {8168, 150, 2}, {8170, 118, 2},
            {8172, 152, 1}, {8184, 112, 2}, {8186, 114, 2},
            {8188, 148, 1}, {8486, 98, 1}, {8490, 92, 1},
            {8491, 94, 1}, {8498, 12, 1}, {8544, 8, 16},
            {8579, 0, 1}, {9398, 10, 26}, {11264, 22, 47},
            {11360, 0, 1}, {11362, 88, 1}, {11363, 102, 1},
            {11364, 90, 1}, {11367, 1, 6}, {11373, 84, 1},
            {11374, 86, 1}, {11375, 80, 1}, {11376, 82, 1},
            {11378, 0, 1}, {11381, 0, 1}, {11390, 78, 2},
            {11392, 1, 100}, {11499, 1, 4}, {11506, 0, 1},
            {42560, 1, 46}, {42624, 1, 24}, {42786, 1, 14},
            {42802, 1, 62}, {42873, 1, 4}, {42877, 76, 1},
            {42878, 1, 10}, {42891, 0, 1}, {42893, 74, 1},
            {42896, 1, 4}, {42912, 1, 10}, {42922, 72, 1},
            {65313, 14, 26},
    };
    static const unsigned short aDelta[] = {
            1, 2, 8, 15, 16, 26, 28, 32,
            37, 38, 40, 48, 63, 64, 69, 71,
            79, 80, 116, 202, 203, 205, 206, 207,
            209, 210, 211, 213, 214, 217, 218, 219,
            775, 7264, 10792, 10795, 23228, 23256, 30204, 54721,
            54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274,
            57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406,
            65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462,
            65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511,
            65514, 65521, 65527, 65528, 65529,
    };

    int folded = c;
    int lo;
    int hi;
    int found;

    assert(c >= 0);
    assert(sizeof(unsigned short) == 2 && sizeof(unsigned char) == 1);

    /* ASCII: only 'A'..'Z' change, and diacritics are never involved. */
    if (c < 128) {
        if (c >= 'A' && c <= 'Z') folded = c + ('a' - 'A');
        return folded;
    }

    /* Beyond 16 bits only 66560..66599 are folded (by adding 40). */
    if (c >= 65536) {
        if (c >= 66560 && c < 66600) folded = c + 40;
        return folded;
    }

    /* 128..65535: locate the last rule whose first codepoint is <= c. */
    lo = 0;
    hi = sizeof(aRule) / sizeof(aRule[0]) - 1;
    found = -1;
    while (lo <= hi) {
        int mid = (hi + lo) / 2;
        if (c < aRule[mid].iCode) {
            hi = mid - 1;
        } else {
            found = mid;
            lo = mid + 1;
        }
    }
    assert(found < 0 || c >= aRule[found].iCode);

    if (found >= 0) {
        const struct TableEntry *pRule = &aRule[found];
        int inRange = c < (pRule->iCode + pRule->nRange);
        /* With the stride bit set, codepoints whose parity differs from
         ** that of iCode are skipped. */
        int skipped = (pRule->flags & 0x01) && ((pRule->iCode ^ c) & 0x01);
        if (inRange && !skipped) {
            folded = (c + (aDelta[pRule->flags >> 1])) & 0x0000FFFF;
            assert(folded > 0);
        }
    }

    if (bRemoveDiacritic) folded = remove_diacritic(folded);
    return folded;
}

/*
 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
 ** from the sqlite3 source file utf.c. If this file is compiled as part
 ** of the amalgamation, they are not required.
 */
#ifndef SQLITE_AMALGAMATION

/*
 ** Lookup table used by READ_UTF8, indexed by (lead byte - 0xc0). Each
 ** entry is the lead byte with its length-marker bits removed: the low
 ** 5 bits for 0xc0..0xdf, the low 4 bits for 0xe0..0xef, the low 3 bits
 ** for 0xf0..0xf7, the low 2 bits for 0xf8..0xfb, the low bit for
 ** 0xfc..0xfd, and 0 for 0xfe and 0xff.
 */
static const unsigned char sqlite3Utf8Trans1[] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

/*
 ** READ_UTF8(zIn, zTerm, c): decode one character starting at zIn into c
 ** and advance zIn past the bytes consumed.
 **
 ** The first byte is always consumed, without comparing zIn to zTerm. If
 ** it is >= 0xc0 it is translated through sqlite3Utf8Trans1[] and each
 ** following byte of the form 10xxxxxx contributes 6 more bits, stopping
 ** at zTerm. A multi-byte result that is below 0x80, lies in
 ** 0xD800..0xDFFF, or equals 0xFFFE or 0xFFFF is replaced by 0xFFFD.
 **
 ** The expansion is several statements that are not wrapped in a block
 ** and it evaluates zIn more than once, so use it only as a complete
 ** statement inside braces and pass a plain pointer variable for zIn.
 */
#define READ_UTF8(zIn, zTerm, c)                           \
c = *(zIn++);                                            \
if( c>=0xc0 ){                                           \
c = sqlite3Utf8Trans1[c-0xc0];                         \
while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
c = (c<<6) + (0x3f & *(zIn++));                      \
}                                                      \
if( c<0x80                                             \
|| (c&0xFFFFF800)==0xD800                          \
|| (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
}

/*
 ** WRITE_UTF8(zOut, c): store codepoint c at zOut as 1, 2, 3 or 4 bytes
 ** (for c below 0x80, 0x800, 0x10000, or anything larger, respectively)
 ** and advance zOut past the bytes written. No bounds check is made on
 ** the output buffer.
 **
 ** NOTE(review): the u8 type is not defined in this part of the file;
 ** presumably it comes from one of the project headers -- confirm.
 */
#define WRITE_UTF8(zOut, c) {                          \
if( c<0x00080 ){                                     \
*zOut++ = (u8)(c&0xFF);                            \
}                                                    \
else if( c<0x00800 ){                                \
*zOut++ = 0xC0 + (u8)((c>>6)&0x1F);                \
*zOut++ = 0x80 + (u8)(c & 0x3F);                   \
}                                                    \
else if( c<0x10000 ){                                \
*zOut++ = 0xE0 + (u8)((c>>12)&0x0F);               \
*zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
*zOut++ = 0x80 + (u8)(c & 0x3F);                   \
}else{                                               \
*zOut++ = 0xF0 + (u8)((c>>18) & 0x07);             \
*zOut++ = 0x80 + (u8)((c>>12) & 0x3F);             \
*zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
*zOut++ = 0x80 + (u8)(c & 0x3F);                   \
}                                                    \
}

#endif /* ifndef SQLITE_AMALGAMATION */

typedef struct unicode_tokenizer unicode_tokenizer;
typedef struct unicode_cursor unicode_cursor;

/*
 ** Tokenizer object created by unicodeCreate() and released by
 ** unicodeDestroy().
 */
struct unicode_tokenizer {
    sqlite3_tokenizer base;         /* Base object; must be the first member */
    int bRemoveDiacritic;           /* From remove_diacritics=; defaults to 1 */
    int nException;                 /* Number of entries in aiException[] */
    int *aiException;               /* Ascending codepoints whose Isalnum() result is inverted */
    char *locale;                   /* malloc()ed copy of the locale argument, or NULL; passed to sb_stemmer_new() */
};

/*
 ** Cursor object created by unicodeOpen() and released by unicodeClose().
 */
struct unicode_cursor {
    sqlite3_tokenizer_cursor base;  /* Base object; must be the first member */
    const unsigned char *aInput;    /* Input text being tokenized */
    int nInput;                     /* Size of aInput[] in bytes */
    int iOff;                       /* Current offset within aInput[] */
    int iToken;                     /* Index of next token to be returned */
    char *zToken;                   /* storage for current token */
    int nAlloc;                     /* space allocated at zToken */
    struct sb_stemmer *stemmer;     /* Snowball stemmer; stays NULL when the tokenizer has no locale */
};


/*
 ** Destroy a tokenizer allocated by unicodeCreate().
 **
 ** Releases the exception array, the locale string and the tokenizer
 ** object itself. Passing NULL is a no-op. Always returns SQLITE_OK.
 */
static int unicodeDestroy(sqlite3_tokenizer *pTokenizer) {
    if (pTokenizer) {
        unicode_tokenizer *p = (unicode_tokenizer *) pTokenizer;
        sqlite3_free(p->aiException);
        /* locale is allocated with malloc() in unicodeCreate(), so it must
         ** be released with free() rather than sqlite3_free(). */
        free(p->locale);
        sqlite3_free(p);
    }
    return SQLITE_OK;
}

/*
 ** Handle a tokenchars= or separators= option from the CREATE VIRTUAL
 ** TABLE statement: every character of the string zIn/nIn is to be
 ** treated as a separator (bAlnum==0) or as a token character (bAlnum==1).
 **
 ** Each codepoint in zIn/nIn for which sqlite3FtsUnicodeIsalnum() does not
 ** already return bAlnum is inserted into p->aiException[], which is kept
 ** in ascending order. For the purposes of tokenization, the return value
 ** of sqlite3FtsUnicodeIsalnum() is inverted for all codepoints in that
 ** array.
 **
 ** Codepoints that sqlite3FtsUnicodeIsdiacritic() identifies as diacritics
 ** are ignored; the behaviour of the tokenizer with respect to them cannot
 ** be changed.
 **
 ** Returns SQLITE_OK, or SQLITE_NOMEM if the array cannot be enlarged (in
 ** which case the tokenizer is left unmodified).
 */
static int unicodeAddExceptions(
        unicode_tokenizer *p,           /* Tokenizer to add exceptions to */
        int bAlnum,                     /* Replace Isalnum() return value with this */
        const char *zIn,                /* Array of characters to make exceptions */
        int nIn                         /* Length of z in bytes */
){
    const unsigned char *zCur = (const unsigned char *)zIn;
    const unsigned char *zEnd = &zCur[nIn];
    int cp;                         /* Codepoint most recently decoded */
    int nAdd = 0;                   /* Number of codepoints needing an entry */
    int *aSorted;                   /* Enlarged aiException[] array */
    int nSorted;                    /* Number of valid entries in aSorted[] */

    assert(bAlnum == 0 || bAlnum == 1);

    /* Pass 1: count the codepoints that require an exception entry. */
    while (zCur < zEnd) {
        READ_UTF8(zCur, zEnd, cp);
        assert((sqlite3FtsUnicodeIsalnum(cp) & 0xFFFFFFFE) == 0);
        if (sqlite3FtsUnicodeIsalnum(cp) == bAlnum) continue;
        if (sqlite3FtsUnicodeIsdiacritic(cp) != 0) continue;
        nAdd++;
    }
    if (nAdd == 0) return SQLITE_OK;

    aSorted = (int *)sqlite3_realloc(p->aiException, (p->nException + nAdd) * sizeof(int));
    if (aSorted == 0) return SQLITE_NOMEM;
    nSorted = p->nException;

    /* Pass 2: decode the string again, inserting each qualifying codepoint
     ** at its sorted position. */
    zCur = (const unsigned char *) zIn;
    while (zCur < zEnd) {
        int pos;
        int k;

        READ_UTF8(zCur, zEnd, cp);
        if (sqlite3FtsUnicodeIsalnum(cp) == bAlnum) continue;
        if (sqlite3FtsUnicodeIsdiacritic(cp) != 0) continue;

        /* First slot whose value is not smaller than cp. */
        pos = 0;
        while (pos < nSorted && aSorted[pos] < cp) pos++;

        /* Shift the tail up by one to open a gap at pos. */
        k = nSorted;
        while (k > pos) {
            aSorted[k] = aSorted[k - 1];
            k--;
        }
        aSorted[pos] = cp;
        nSorted++;
    }

    p->aiException = aSorted;
    p->nException = nSorted;
    return SQLITE_OK;
}

/*
 ** Return 1 if the p->aiException[] array contains the value iCode and
 ** 0 otherwise. The array is binary-searched, so it must be in ascending
 ** order.
 */
static int unicodeIsException(unicode_tokenizer *p, int iCode) {
    const int *aSorted = p->aiException;
    int lo = 0;
    int hi = p->nException - 1;     /* -1 for an empty array: loop is skipped */

    while (lo <= hi) {
        int mid = (hi + lo) / 2;
        int probe = aSorted[mid];
        if (iCode < probe) {
            hi = mid - 1;
        } else if (iCode > probe) {
            lo = mid + 1;
        } else {
            return 1;
        }
    }

    return 0;
}

/*
 ** Return 1 if, for the purposes of tokenization, codepoint iCode is
 ** considered a token character, or 0 if it is a separator. This is the
 ** result of sqlite3FtsUnicodeIsalnum(), inverted when iCode is listed in
 ** the tokenizer's exception array.
 */
static int unicodeIsAlnum(unicode_tokenizer *p, int iCode) {
    int isAlnum = sqlite3FtsUnicodeIsalnum(iCode);
    int isInverted = unicodeIsException(p, iCode);

    assert((isAlnum & 0xFFFFFFFE) == 0);
    return isAlnum ^ isInverted;
}

/*
 ** Create a new tokenizer instance.
 **
 ** Recognized arguments:
 **
 **   remove_diacritics=1   Strip diacritics while folding (the default).
 **   remove_diacritics=0   Keep diacritics.
 **   tokenchars=<chars>    Treat each of <chars> as a token character.
 **   separators=<chars>    Treat each of <chars> as a separator.
 **
 ** Any other non-empty argument is stored as the locale that unicodeOpen()
 ** passes to sb_stemmer_new(); if several such arguments are given the
 ** last one wins. An empty argument is an error.
 **
 ** On success *pp receives the new tokenizer and SQLITE_OK is returned.
 ** On failure *pp is set to NULL (except when the tokenizer object itself
 ** cannot be allocated, in which case *pp is left untouched) and
 ** SQLITE_NOMEM or SQLITE_ERROR is returned.
 */
static int unicodeCreate(
        int nArg,                       /* Size of array argv[] */
        const char * const *azArg,      /* Tokenizer creation arguments */
        sqlite3_tokenizer **pp          /* OUT: New tokenizer handle */
){
    unicode_tokenizer *pNew;        /* New tokenizer object */
    int i;
    int rc = SQLITE_OK;

    pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
    if (pNew == NULL) return SQLITE_NOMEM;

    memset(pNew, 0, sizeof(unicode_tokenizer));
    pNew->bRemoveDiacritic = 1;

    for (i = 0; rc == SQLITE_OK && i < nArg; i++) {
        const char *z = azArg[i];
        size_t n = strlen(z);

        if (n == 19 && memcmp("remove_diacritics=1", z, 19) == 0) {
            pNew->bRemoveDiacritic = 1;
        }
        else if (n == 19 && memcmp("remove_diacritics=0", z, 19) == 0) {
            pNew->bRemoveDiacritic = 0;
        }
        else if (n >= 11 && memcmp("tokenchars=", z, 11) == 0) {
            rc = unicodeAddExceptions(pNew, 1, &z[11], (int) (n - 11));
        }
        else if (n >= 11 && memcmp("separators=", z, 11) == 0) {
            rc = unicodeAddExceptions(pNew, 0, &z[11], (int) (n - 11));
        }
        else if (n > 0) {
            /* Anything else is taken to be the stemmer locale. Build the
             ** copy first so that a failed allocation is reported instead of
             ** being written through, then release any locale stored by an
             ** earlier argument so it does not leak. */
            char *zLocale = malloc(n + 1);
            if (zLocale == NULL) {
                rc = SQLITE_NOMEM;
            } else {
                memcpy(zLocale, z, n + 1);
                free(pNew->locale);
                pNew->locale = zLocale;
            }
        }
        else {
            /* Unrecognized (empty) argument */
            rc = SQLITE_ERROR;
        }
    }

    if (rc != SQLITE_OK) {
        /* Release the locale here and clear the pointer, so the cleanup is
         ** complete whether or not unicodeDestroy() also frees it. */
        free(pNew->locale);
        pNew->locale = NULL;
        unicodeDestroy((sqlite3_tokenizer *) pNew);
        pNew = 0;
    }
    *pp = (sqlite3_tokenizer *) pNew;
    return rc;
}

/*
 ** Prepare to begin tokenizing a particular string.  The input
 ** string to be tokenized is aInput[0..nInput-1].  A cursor
 ** used to incrementally tokenize this string is returned in
 ** *pp.
 **
 ** If aInput is NULL the input is treated as empty; if nInput is negative
 ** the length is taken from strlen(aInput). When the tokenizer has a
 ** locale, a Snowball stemmer is created for it and stored in the cursor;
 ** otherwise the cursor's stemmer stays NULL.
 **
 ** Returns SQLITE_OK, or SQLITE_NOMEM if the cursor cannot be allocated
 ** (in which case *pp is not written).
 */
static int unicodeOpen(
        sqlite3_tokenizer *p,           /* The tokenizer */
        const char *aInput,             /* Input string */
        int nInput,                     /* Size of string aInput in bytes */
        sqlite3_tokenizer_cursor **pp   /* OUT: New cursor object */
){
    unicode_tokenizer *pTokenizer = (unicode_tokenizer *) p;
    unicode_cursor *pCsr;

    pCsr = (unicode_cursor *) sqlite3_malloc(sizeof(unicode_cursor));
    if (pCsr == 0) {
        /* Nothing has been allocated yet, so there is nothing to release;
         ** pCsr must not be dereferenced here. */
        return SQLITE_NOMEM;
    }
    memset(pCsr, 0, sizeof(unicode_cursor));

    pCsr->aInput = (const unsigned char *) aInput;
    if (aInput == 0) {
        pCsr->nInput = 0;
    } else if (nInput < 0) {
        pCsr->nInput = (int) strlen(aInput);
    } else {
        pCsr->nInput = nInput;
    }

    if (pTokenizer->locale != NULL) {
        /* Snowball stemmer for the configured locale.
         ** NOTE(review): the result is stored without a check; presumably it
         ** is NULL for an unsupported locale -- confirm that the rest of
         ** unicodeNext() copes with a NULL stemmer. */
        pCsr->stemmer = sb_stemmer_new((const char *) pTokenizer->locale, NULL);
    }

    *pp = &pCsr->base;
    return SQLITE_OK;
}

/*
 ** Close a tokenization cursor previously opened by a call to
 ** unicodeOpen() above: release the token buffer, the Snowball stemmer
 ** and finally the cursor object itself. Always returns SQLITE_OK.
 */
static int unicodeClose(sqlite3_tokenizer_cursor *pCursor) {
    unicode_cursor *pCur = (unicode_cursor *) pCursor;
    char *zBuf = pCur->zToken;
    struct sb_stemmer *pStemmer = pCur->stemmer;

    sqlite3_free(zBuf);
    sb_stemmer_delete(pStemmer);
    sqlite3_free(pCur);

    return SQLITE_OK;
}

/*
 ** Return non-zero if the nTag bytes starting at z lie entirely inside
 ** [z, zTerm) and compare equal to zTag, ignoring case (strncasecmp).
 */
static int htmlTokTagMatches(
        const unsigned char *z,         /* Candidate position in the input */
        const unsigned char *zTerm,     /* One past the last input byte */
        const char *zTag,               /* Tag text to compare against */
        size_t nTag                     /* Number of bytes in zTag */
){
    if (z > zTerm || (size_t) (zTerm - z) < nTag) {
        return 0;
    }
    return strncasecmp((const char *) z, zTag, nTag) == 0;
}

/*
 ** Search [z, zTerm) for the first occurrence of zTag immediately followed
 ** by '>' (case-sensitive).  Return a pointer to that '>' or NULL if there
 ** is no such occurrence inside the input.  Unlike strstr() this never
 ** reads at or beyond zTerm and does not need a NUL terminator.
 */
static const unsigned char *htmlTokFindTagEnd(
        const unsigned char *z,         /* Where to start searching */
        const unsigned char *zTerm,     /* One past the last input byte */
        const char *zTag,               /* Tag name, without the '>' */
        size_t nTag                     /* Number of bytes in zTag */
){
    while (z < zTerm && (size_t) (zTerm - z) > nTag) {
        if (z[nTag] == '>' && memcmp(z, zTag, nTag) == 0) {
            return z + nTag;
        }
        z++;
    }
    return NULL;
}

/*
 ** Advance z to the next '>' inside [z, zTerm).  Returns zTerm if there
 ** is none.
 */
static const unsigned char *htmlTokSkipToGt(
        const unsigned char *z,
        const unsigned char *zTerm
){
    while (z < zTerm && z[0] != '>') {
        z++;
    }
    return z;
}

/*
 ** Extract the next token from a tokenization cursor.  The cursor must
 ** have been opened by a prior call to unicodeOpen().
 **
 ** HTML handling:
 **   - Between tokens every tag is skipped.  Elements whose opening tag
 **     starts with an entry of ignore_opening_tags[] or
 **     nonbreaking_ignore_tags[] are skipped together with their content,
 **     up to the matching end tag when one exists in the input.
 **   - Inside a token, opening and closing tags listed in
 **     nonbreaking_tags[] are skipped without ending the token, and
 **     elements listed in nonbreaking_ignore_tags[] are skipped including
 **     their content.  Any other tag ends the token.
 **
 ** The token text is case-folded (and stripped of diacritics when the
 ** tokenizer is configured so) into pCsr->zToken and NUL-terminated.  If
 ** the cursor has a stemmer and it yields a result, the stemmed form is
 ** returned instead.
 **
 ** Returns SQLITE_OK when a token was produced, SQLITE_DONE when the input
 ** is exhausted, or SQLITE_NOMEM if the token buffer cannot be grown.
 */
static int unicodeNext(
        sqlite3_tokenizer_cursor *pC,   /* Cursor returned by unicodeOpen */
        const char **paToken,           /* OUT: Token text */
        int *pnToken,                   /* OUT: Number of bytes at *paToken */
        int *piStart,                   /* OUT: Starting offset of token */
        int *piEnd,                     /* OUT: Ending offset of token */
        int *piPos                      /* OUT: Position integer of token */
) {
    unicode_cursor *pCsr = (unicode_cursor *) pC;
    unicode_tokenizer *p = ((unicode_tokenizer *) pCsr->base.pTokenizer);
    int iCode = 0;
    char *zOut;
    const unsigned char *z = &pCsr->aInput[pCsr->iOff];
    const unsigned char *zStart = z;
    const unsigned char *zEnd = z;
    const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput];

    const int numberOfIgnoreTags =
            (int) (sizeof(ignore_opening_tags) / sizeof(ignore_opening_tags[0]));
    const int numberOfNonbreakingTags =
            (int) (sizeof(nonbreaking_tags) / sizeof(nonbreaking_tags[0]));
    const int numberOfNonbreakingIgnoreTags =
            (int) (sizeof(nonbreaking_ignore_tags) / sizeof(nonbreaking_ignore_tags[0]));

    /* Scan past any delimiter characters before the start of the next token.
     ** Return SQLITE_DONE early if this takes us all the way to the end of
     ** the input.  */
    while (z < zTerm) {

        // Start Ignore HTML tags
        if (z[0] == '<') {
            z++;

            if (z < zTerm && z[0] != '/') {
                int ignoredTag = 0;

                /* Elements that are dropped together with their content.
                 ** Only the first matching entry is applied. */
                for (int i = 0; i < numberOfIgnoreTags && ignoredTag == 0; i++) {
                    const char *zOpen = ignore_opening_tags[i];
                    size_t nOpen = strlen(zOpen);
                    if (htmlTokTagMatches(z, zTerm, zOpen, nOpen)) {
                        const char *zClose = ignore_closing_tags[i];
                        const unsigned char *zFound;

                        ignoredTag = 1;
                        z += nOpen;

                        /* Jump to the '>' of the end tag, if there is one;
                         ** otherwise only the opening tag is skipped. */
                        zFound = htmlTokFindTagEnd(z, zTerm, zClose, strlen(zClose));
                        if (zFound != NULL) {
                            z = zFound;
                        }
                    }
                }

                for (int i = 0; i < numberOfNonbreakingIgnoreTags && ignoredTag == 0; i++) {
                    const char *zTag = nonbreaking_ignore_tags[i];
                    size_t nTag = strlen(zTag);
                    if (htmlTokTagMatches(z, zTerm, zTag, nTag)) {
                        const unsigned char *zFound;

                        ignoredTag = 1;
                        z += nTag;

                        zFound = htmlTokFindTagEnd(z, zTerm, zTag, nTag);
                        if (zFound != NULL) {
                            z = zFound;
                        }
                    }
                }
            }

            /* Skip the remainder of the tag.  The closing '>' itself is
             ** consumed as an ordinary delimiter by READ_UTF8 below. */
            z = htmlTokSkipToGt(z, zTerm);
            if (z >= zTerm) {
                /* Unterminated tag at the end of the input: no more tokens. */
                return SQLITE_DONE;
            }
        }
        // End Ignore HTML Tags

        READ_UTF8(z, zTerm, iCode);

        if (unicodeIsAlnum(p, iCode)) {
            break;
        }

        zStart = z;
    }

    if (zStart >= zTerm) {
        return SQLITE_DONE;
    }

    zOut = pCsr->zToken;
    do {
        int iOut;

        /* Grow the output buffer if required.  Growing whenever fewer than
         ** five bytes remain leaves room for one encoded character plus the
         ** NUL terminator written after the loop. */
        if ((zOut - pCsr->zToken) >= (pCsr->nAlloc - 4)) {
            char *zNew = (char *) sqlite3_realloc(pCsr->zToken, pCsr->nAlloc + 64);
            if (!zNew) {
                return SQLITE_NOMEM;
            }
            zOut = &zNew[zOut - pCsr->zToken];
            pCsr->zToken = zNew;
            pCsr->nAlloc += 64;
        }

        /* Write the folded case of the last character read to the output */
        zEnd = z;
        iOut = sqlite3FtsUnicodeFold(iCode, p->bRemoveDiacritic);
        if (iOut) {
            WRITE_UTF8(zOut, iOut);
        }

        /* If the cursor is not at EOF, read the next character */
        if (z >= zTerm) {
            break;
        }

        // Start Ignore HTML tags
        while (z < zTerm && z[0] == '<') {
            int skipped = 0;

            if ((zTerm - z) > 1 && z[1] == '/') {
                /* Closing tag: transparent only for nonbreaking tags. */
                for (int i = 0; i < numberOfNonbreakingTags && skipped == 0; i++) {
                    const char *zTag = nonbreaking_tags[i];
                    size_t nTag = strlen(zTag);
                    if (htmlTokTagMatches(z + 2, zTerm, zTag, nTag)) {
                        skipped = 1;
                        z = htmlTokSkipToGt(z + 2 + nTag, zTerm);
                        if (z < zTerm) {
                            z++;        /* step over the '>' */
                        }
                    }
                }
            } else {
                /* Opening tag of an element to drop entirely (content
                 ** included) without ending the token. */
                for (int i = 0; i < numberOfNonbreakingIgnoreTags && skipped == 0; i++) {
                    const char *zTag = nonbreaking_ignore_tags[i];
                    size_t nTag = strlen(zTag);
                    if (htmlTokTagMatches(z + 1, zTerm, zTag, nTag)) {
                        const unsigned char *zFound;

                        skipped = 1;
                        z += 1 + nTag;

                        zFound = htmlTokFindTagEnd(z, zTerm, zTag, nTag);
                        if (zFound != NULL) {
                            z = zFound + 1;     /* just past the end tag's '>' */
                        }
                    }
                }

                /* Opening tag of a nonbreaking element: skip the tag only. */
                for (int i = 0; i < numberOfNonbreakingTags && skipped == 0; i++) {
                    const char *zTag = nonbreaking_tags[i];
                    size_t nTag = strlen(zTag);
                    if (htmlTokTagMatches(z + 1, zTerm, zTag, nTag)) {
                        skipped = 1;
                        z = htmlTokSkipToGt(z + 1 + nTag, zTerm);
                        if (z < zTerm) {
                            z++;        /* step over the '>' */
                        }
                    }
                }
            }

            if (skipped == 0) {
                break;
            }
        }
        // End Ignore HTML Tags

        /* Any other tag, or the end of the input, terminates the token.
         ** z is left on the '<' so the next call starts with that tag. */
        if (z >= zTerm || z[0] == '<') {
            break;
        }

        READ_UTF8(z, zTerm, iCode);

    } while (unicodeIsAlnum(p, iCode) || sqlite3FtsUnicodeIsdiacritic(iCode));

    /* NUL-terminate the token.  The grow check above guarantees that zOut
     ** still points inside the buffer. */
    *zOut = '\0';

    /* Set the output variables and return. */
    pCsr->iOff = (z - pCsr->aInput);

    const sb_symbol *stemmed = NULL;
    if (pCsr->stemmer != NULL) {
        stemmed = sb_stemmer_stem(pCsr->stemmer, (sb_symbol *) pCsr->zToken,
                                  (int) (zOut - pCsr->zToken));
    }

    if (stemmed) {
        /* NOTE(review): the stemmed buffer presumably belongs to the stemmer
         ** and is only valid until the stemmer is used again - confirm
         ** against libstemmer.h. */
        *paToken = (char *) stemmed;
        *pnToken = (int) strlen((char *) stemmed);
    } else {
        *paToken = pCsr->zToken;
        *pnToken = (int) (zOut - pCsr->zToken);
    }

    *piStart = (int) (zStart - pCsr->aInput);
    *piEnd = (int) (zEnd - pCsr->aInput);
    *piPos = pCsr->iToken++;

    return SQLITE_OK;
}

/*
 ** Method table for this tokenizer.  Its address is handed out by
 ** set_html_tokenizer_module() and bound as the second argument of
 ** fts3_tokenizer() in registerTokenizer().
 */
static const sqlite3_tokenizer_module unicode_module = {
        0,                  /* NOTE(review): presumably the interface version (iVersion) - confirm against fts3_tokenizer.h */
        unicodeCreate,      /* Create a tokenizer instance */
        unicodeDestroy,     /* Destroy a tokenizer instance */
        unicodeOpen,        /* Open a cursor on an input string */
        unicodeClose,       /* Close a cursor */
        unicodeNext,        /* Fetch the next token from a cursor */
};

/*
 ** Store in *ppModule the address of the static sqlite3_tokenizer_module
 ** method table (unicode_module) that implements this tokenizer.
 */
void set_html_tokenizer_module(sqlite3_tokenizer_module const **ppModule){
    const sqlite3_tokenizer_module *pMethods = &unicode_module;
    ppModule[0] = pMethods;
}

/*
 ** SQLite user defined function to use with matchinfo() to calculate the
 ** relevancy of an FTS match. The value returned is the relevancy score
 ** (a real value greater than or equal to zero). A larger value indicates
 ** a more relevant document.
 **
 ** The overall relevancy returned is the sum of the relevancies of each
 ** column value in the FTS table. The relevancy of a column value is the
 ** sum of the following for each reportable phrase in the FTS query:
 **
 **   (<hit count> / <global hit count>) * <column weight>
 **
 ** where <hit count> is the number of instances of the phrase in the
 ** column value of the current row and <global hit count> is the number
 ** of instances of the phrase in the same column of all rows in the FTS
 ** table. The <column weight> is a weighting factor assigned to each
 ** column by the caller (see below).
 **
 ** The first argument to this function must be the return value of the FTS
 ** matchinfo() function. Following this must be one argument for each column
 ** of the FTS table containing a numeric weight factor for the corresponding
 ** column. Example:
 **
 **     CREATE VIRTUAL TABLE documents USING fts3(title, content)
 **
 ** The following query returns the docids of documents that match the full-text
 ** query <query> sorted from most to least relevant. When calculating
 ** relevance, query term instances in the 'title' column are given twice the
 ** weighting of those in the 'content' column.
 **
 **     SELECT docid FROM documents
 **     WHERE documents MATCH <query>
 **     ORDER BY ftsrank(matchinfo(documents), 1.0, 0.5) DESC
 */
static void ftsRankFunc(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal){
    const unsigned int *aMatchinfo; /* Return value of matchinfo() */
    int nMatchinfo;                 /* Number of elements in aMatchinfo[] */
    unsigned int nCol = 0;          /* Number of columns in the table */
    unsigned int nPhrase = 0;       /* Number of phrases in the query */
    unsigned int iPhrase;           /* Current phrase */
    double score = 0.0;             /* Value to return */

    assert( sizeof(int)==4 );

    /* Check that the number of arguments passed to this function is correct.
     ** If not, jump to wrong_number_args. Set aMatchinfo to point to the array
     ** of unsigned integer values returned by FTS function matchinfo. Set
     ** nPhrase to contain the number of reportable phrases in the users full-text
     ** query, and nCol to the number of columns in the table. Then check that the
     ** size of the matchinfo blob is as expected. Return an error if it is not.
     */
    if( nVal<1 ) goto wrong_number_args;
    aMatchinfo = (const unsigned int *)sqlite3_value_blob(apVal[0]);
    nMatchinfo = sqlite3_value_bytes(apVal[0]) / (int)sizeof(unsigned int);
    if( aMatchinfo!=0 && nMatchinfo>=2 ){
        nPhrase = aMatchinfo[0];
        nCol = aMatchinfo[1];
    }

    /* The blob must hold exactly 2 + 3*nCol*nPhrase integers. The test is
     ** written as a division on the trusted element count so that header
     ** values taken from the blob can never overflow the arithmetic.
     */
    if( aMatchinfo==0
     || nMatchinfo<2
     || ((nMatchinfo-2)%3)!=0
     || (unsigned long long)((nMatchinfo-2)/3)!=(unsigned long long)nPhrase*nCol
    ){
        sqlite3_result_error(pCtx,
                             "invalid matchinfo blob passed to function ftsRankfunc()", -1);
        return;
    }
    if( (unsigned long long)nVal!=1ULL+nCol ) goto wrong_number_args;

    /* Iterate through each phrase in the users query. */
    for(iPhrase=0; iPhrase<nPhrase; iPhrase++){
        unsigned int iCol;            /* Current column */

        /* Now iterate through each column in the users query. For each column,
         ** increment the relevancy score by:
         **
         **   (<hit count> / <global hit count>) * <column weight>
         **
         ** aPhraseinfo[] points to the start of the data for phrase iPhrase. So
         ** the hit count and global hit counts for each column are found in
         ** aPhraseinfo[iCol*3] and aPhraseinfo[iCol*3+1], respectively. The
         ** size check above guarantees that every index used here is inside
         ** the blob.
         */
        const unsigned int *aPhraseinfo = &aMatchinfo[2 + iPhrase*nCol*3];
        for(iCol=0; iCol<nCol; iCol++){
            unsigned int nHitCount = aPhraseinfo[3*iCol];
            unsigned int nGlobalHitCount = aPhraseinfo[3*iCol+1];
            double weight = sqlite3_value_double(apVal[iCol+1]);
            /* Skip columns without hits; a zero global count (only possible
             ** with a malformed blob) is skipped rather than divided by. */
            if( nHitCount>0 && nGlobalHitCount>0 ){
                score += ((double)nHitCount / (double)nGlobalHitCount) * weight;
            }
        }
    }

    sqlite3_result_double(pCtx, score);
    return;

    /* Jump here if the wrong number of arguments are passed to this function */
    wrong_number_args:
    sqlite3_result_error(pCtx, "wrong number of arguments to function ftsRankfunc()", -1);
}


/*
 ** Register this file's tokenizer module with database connection db under
 ** the name zName by running "SELECT fts3_tokenizer(?, ?)", and register the
 ** SQL function "ftsrank" (implemented by ftsRankFunc).
 **
 ** Returns SQLITE_OK if every step succeeded, otherwise the first error
 ** code encountered. If the statement cannot be prepared the function
 ** returns immediately and "ftsrank" is not registered; in every other case
 ** the registration of "ftsrank" is still attempted even when registering
 ** the tokenizer failed.
 */
int registerTokenizer(sqlite3 *db, const char *zName) {
    int rc;
    int rcFinalize;
    int rcFunc;
    sqlite3_stmt *pStmt = 0;
    const char *zSql = "SELECT fts3_tokenizer(?, ?)";
    const sqlite3_tokenizer_module *p = &unicode_module;

    rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
    if (rc != SQLITE_OK) {
        return rc;
    }

    /* The blob is the module POINTER itself, hence &p and sizeof(p). Both
     ** bindings are SQLITE_STATIC, which is fine because the statement is
     ** finalized before zName and p go out of scope. */
    rc = sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
    if (rc == SQLITE_OK) {
        rc = sqlite3_bind_blob(pStmt, 2, &p, (int) sizeof(p), SQLITE_STATIC);
    }
    if (rc == SQLITE_OK) {
        /* A failed step is reported by sqlite3_finalize() below. */
        (void) sqlite3_step(pStmt);
    }

    rcFinalize = sqlite3_finalize(pStmt);
    if (rc == SQLITE_OK) {
        rc = rcFinalize;
    }

    rcFunc = sqlite3_create_function(db, "ftsrank", -1, SQLITE_ANY, 0, ftsRankFunc, 0, 0);
    if (rc == SQLITE_OK) {
        rc = rcFunc;
    }

    return rc;
}

#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */