#include <cstdio>      // *printf
#include <unistd.h>    // fileno,fork,pipe
#include <csignal>     // signal,kill
#include <sys/wait.h>  // waitpid
#include <cctype>      // isalpha,...
#include <map>         // map
#include <set>         // set
#include <list>        // list
#include <cerrno>      // errno
#include <cstdlib>     // perror
#include <cstring>     // memmove

#include "htmlrecode.hh"

#define DEBUG 0

//#define GXBLT

using namespace std;

/* Name of the iconv charset that matches wchar_t's in-memory layout.
 * It is used throughout this file as the intermediate ("mid") encoding:
 * input is converted into it and output is converted from it.
 * The width is chosen from sizeof(wchar_t); the byte order is probed by
 * reading the first two bytes of "\1\0\0\0" as a short (1 = little-endian).
 * NULL means wchar_t is neither 16 nor 32 bits wide. */
static const char *const midset  // do endian tests and find internal type
     = (sizeof(wchar_t) == 4)  // if wchar_t is 32-bit
             ? ((*(const short *)"\1\0\0\0" == 1) ? "UCS-4LE" : "UCS-4BE")
     : (sizeof(wchar_t) == 2)  // if wchar_t is 16-bit
             ? ((*(const short *)"\1\0\0\0" == 1) ? "UCS-2LE" : "UCS-2BE")
     : NULL;                   // otherwise we are lost


#if 0
/* Disabled: exact (case-sensitive) comparison of a wide string against a
 * narrow NUL-terminated string. IsEqual() below is the case-insensitive
 * replacement that is actually compiled. */
static bool operator==(const wstring &s1, const char *s2)
{
    for(unsigned a=0; a<s1.size(); ++a, ++s2)
    {
        /* Narrow string ended early, or the characters differ */
        if(!*s2 || s1[a] != (unsigned char)*s2)return false;
    }
    /* Equal only if the narrow string ends here as well */
    return *s2 ? false : true;
}
#endif
/* Appends a narrow NUL-terminated string to a wide string by widening
 * each char individually; no character set conversion is performed. */
static void operator+= (wstring &s1, const char *s2)
{
    for(const char *cursor = s2; *cursor; ++cursor)
        s1 += *cursor;
}
/* Case-insensitive comparison of a wide string against a narrow
 * NUL-terminated string.
 *
 * Returns true only when both strings have the same length and every
 * wide character fits in one byte and matches the corresponding narrow
 * character after toupper().
 *
 * Characters are passed to toupper() as unsigned char: handing it a
 * negative plain char is undefined behaviour. The wide character is
 * range-checked as an unsigned value so that negative wchar_t values
 * are rejected instead of being truncated into the 8-bit range.
 */
static bool IsEqual(const wstring &s1, const char *s2)
{
    for(size_t a=0; a<s1.size(); ++a, ++s2)
    {
        if(!*s2)return false;               /* narrow string is shorter */

        const unsigned long wide = static_cast<unsigned long>(s1[a]);
        if(wide >= 0x100)return false;      /* not representable in 8 bits */

        const unsigned char c1 = static_cast<unsigned char>(wide);
        const unsigned char c2 = static_cast<unsigned char>(*s2);
        if(toupper(c1) != toupper(c2))return false;
    }
    return !*s2;                            /* narrow string must end too */
}
/* Case-insensitive comparison of two wide strings.
 *
 * Characters that are identical always match. Otherwise two characters
 * match only when both fit in one byte (value < 0x100) and are equal
 * after toupper().
 *
 * The range test is strictly "< 0x100": allowing 0x100 itself would
 * truncate it to 0 and make U+0100 compare equal to U+0000. The values
 * are checked as unsigned so negative wchar_t values never qualify, and
 * toupper() receives an unsigned char as it requires.
 */
static bool IsEqual(const wstring &s1, const wstring &s2)
{
    if(s1.size() != s2.size()) return false;
    for(size_t a=0; a<s1.size(); ++a)
    {
        if(s1[a] == s2[a]) continue;

        const unsigned long c1 = static_cast<unsigned long>(s1[a]);
        const unsigned long c2 = static_cast<unsigned long>(s2[a]);
        if(c1 < 0x100 && c2 < 0x100
        && toupper(static_cast<unsigned char>(c1))
        == toupper(static_cast<unsigned char>(c2))) continue;

        return false;
    }
    return true;
}

/* Narrows a wide string for diagnostics: characters below 0x100 are
 * copied as single bytes, everything else becomes '?'. */
static const string Stringify(const wstring &s)
{
    string narrow;
    narrow.reserve(s.size());
    for(wstring::const_iterator it = s.begin(); it != s.end(); ++it)
    {
        if(*it < 0x100)
            narrow += (char)*it;
        else
            narrow += '?';
    }
    return narrow;
}

/* Reads one ucs4 unit (in host byte order) from fp.
 *
 * Returns 0 when a complete unit could not be read; callers are expected
 * to check feof()/ferror() on fp to tell that apart from a real U+0000.
 * The read size follows sizeof(ucs4) rather than a literal 4 so the
 * buffer can never be overrun, and the result is always initialized.
 */
static ucs4 Getc(FILE *fp)
{
    ucs4 p = 0;
    if(fread(&p, 1, sizeof p, fp) != sizeof p)
        return 0;
    return p;
}

/* Table of named character entities: entity name -> character code.
 * The rows come from entities.h. FindEntity() binary-searches this table,
 * so the rows must be sorted by name in std::string comparison order. */
static const struct { const char *ent; ucs4 ch; } EnTab[] = {
#include "entities.h"
};
/* Number of rows in EnTab */
#define ENTITYCOUNT (sizeof(EnTab) / sizeof(EnTab[0]))
/* Looks up a named entity (without the leading '&' and trailing ';').
 * The name is narrowed to bytes and binary-searched in EnTab.
 * Returns the entity's character code, or ilseq when it is not found. */
static ucs4 FindEntity(const wstring &ws)
{
    /* Narrow the name; every wide character is truncated to a char */
    string key;
    key.reserve(ws.size());
    for(size_t i=0; i<ws.size(); ++i)
        key += (char)ws[i];

    unsigned low = 0, high = ENTITYCOUNT-1;
    while(true)
    {
        const unsigned mid = (low+high)/2;
        const int order = key.compare(EnTab[mid].ent);

        if(order == 0)
            return EnTab[mid].ch;
        if(low >= high)
            break;              /* range exhausted without a match */

        if(order > 0)
            low = mid+1;        /* key sorts after the probe */
        else
            high = mid;         /* key sorts before the probe */
    }
    return ilseq;
}

/* putc and puts may be macros in <cstdio>; remove them because
 * Page::Dumper has member functions with these names. */
#undef putc
#undef puts

/* Counter incremented by Page::DumpRaw() each time it wraps raw content
 * in an HTML comment */
static unsigned FixedStyleScript = 0;
/* Counter; not modified anywhere in the visible part of this file */
static unsigned ParamViolations  = 0;

/* htmlencode(): emit numeric references for characters CanDump() rejects */
static bool lossless = true;
/* htmlencode(): write those numeric references as &#x...; instead of &#...; */
static bool usehex = false;
/* htmlencode(): when set, character 160 is not rewritten to &nbsp; */
static bool strict = false;
/* Diagnostics level: >= 1 prints the recoding progress lines,
 * >= 0 prints the charset-mismatch warning in Page::Parse() */
static int verbose = 1;
/* DumpTag(): always quote attribute values, using double quotes */
static bool xmlmode = false;
/* Set when the input began with ucsig; Page::Dump() then emits ucsig
 * first if the target encoding can express it */
static bool signature = false;

/* Opens an iconv descriptor that converts from set2 into set1
 * (iconv_open takes the target charset first, then the source).
 *
 * conv receives the descriptor.
 * Returns true on FAILURE (after printing a diagnostic), false on success.
 */
bool Page::Dumper::OpenConv(iconv_t &conv, const char *set1, const char *set2)
{
    conv = iconv_open(set1, set2);
    if(conv == (iconv_t)-1)
    {
        /* The converter goes from set2 to set1, so report it in that order */
        fprintf(stderr, "iconv_open failed to create '%s' to '%s' converter. Aborting.\n",
            set2, set1);
        return true;
    }
    return false;
}

/* Starts with the output charset equal to the intermediate charset and
 * opens both iconv descriptors (converter and tester) for it.
 * Terminates the program with EINVAL if either cannot be opened. */
Page::Dumper::Dumper() : charset(midset)
{
    const char *const target = charset.c_str();

    /* Short-circuit: the tester is only opened when the converter succeeded */
    const bool failed = OpenConv(converter, target, midset)
                     || OpenConv(tester,    target, midset);
    if(failed)
        exit(EINVAL);
}
/* Releases both iconv descriptors held by the dumper. */
Page::Dumper::~Dumper()
{
    iconv_close(converter);
    iconv_close(tester);
}

/* Writes a single character by handing a one-character string to puts(). */
void Page::Dumper::putc(ucs4 p) const
{
    const wstring single(1, p);
    puts(single);
}

/* Tells whether character p can be converted by the tester descriptor.
 * The converted bytes go to a scratch buffer and are discarded.
 * Returns false if iconv() reports any error, true otherwise. */
bool Page::Dumper::isok(ucs4 p) const
{
    char scratch[256];

    char  *src     = (char *)&p;
    size_t srcleft = sizeof(p);
    char  *dst     = scratch;
    size_t dstleft = sizeof scratch;

    return iconv(tester, &src, &srcleft, &dst, &dstleft) != (size_t)-1;
}

/* Builds a wide string from a narrow NUL-terminated string using the
 * file-local operator+= (char-by-char widening). */
static const wstring makewstr(const char *s)
{
    wstring widened;
    widened += s;
    return widened;
}

/* A parsed tag: its name plus an ordered list of items, each of which is
 * one of the Param subclasses below. Page::DumpTag() writes the items
 * back in list order. */
struct Page::Tag
{
    /* Polymorphic base of everything that can appear inside a tag */
    class Param: public ptrable
    {
    public:
        Param() {}
        virtual ~Param() {}
    };
    /* name=value attribute */
    class ParamParam: public Param
    {
    public:
        wstring name, value;
    };
    /* Attribute without a value */
    class ParamKey: public Param
    {
    public:
        wstring name;
    };
    /* Comment inside the tag; DumpTag() writes it between "--" pairs */
    class ParamComm: public Param
    {
    public:
        wstring data;
    };
    /* Text between items, written back verbatim by DumpTag() */
    class ParamSpace: public Param
    {
    public:
        wstring data;
    };

    typedef autoptr<Param> itemp_t;
    typedef list<itemp_t> list_t;
    list_t items;       /* tag contents in source order */
    wstring Name;       /* tag name */
    bool terminating;   /* DumpTag() writes " /" before '>' when set */
    
    typedef list_t::const_iterator const_iterator;
    typedef list_t::iterator iterator;
    
    /* Start in a defined state: previously 'terminating' stayed
     * uninitialized until clear() was called. */
    Tag() : terminating(false) {}
    
    /* Appends a name=value attribute. */
    void SetParam(const wstring &name,
                  const wstring &value)
    {
        ParamParam *tmp = new ParamParam;
        tmp->name = name;
        tmp->value = value;
        items.push_back(tmp);
    }
    /* Appends a value-less attribute. */
    void SetKey(const wstring &name)
    {
        ParamKey *tmp = new ParamKey;
        tmp->name = name;
        items.push_back(tmp);
    }
    /* Appends a comment item. */
    void AddComment(const wstring &data)
    {
        ParamComm *tmp = new ParamComm;
        tmp->data = data;
        items.push_back(tmp);
    }
    /* Appends a spacing item. */
    void AddSpace(const wstring &data)
    {
        ParamSpace *tmp = new ParamSpace;
        tmp->data = data;
        items.push_back(tmp);
    }
    
    /* True if a name=value attribute with this name exists
     * (name comparison via IsEqual, i.e. case-insensitive). */
    template<typename str>
    bool HasParamCalled(const str &name) const
    {
        for(const_iterator i=items.begin(); i!=items.end(); ++i)
        {
            const Param *p = *i;
            if(const ParamParam *param = dynamic_cast<const ParamParam *> (p))
                if(IsEqual(param->name, name)) return true;
        }
        return false;
    }
    /* Returns a copy of the first matching attribute's value,
     * or an empty string when there is no such attribute. */
    template<typename str>
    const wstring GetParamValue(const str &name) const
    {
        for(const_iterator i=items.begin(); i!=items.end(); ++i)
        {
            const Param *p = *i;
            if(const ParamParam *param = dynamic_cast<const ParamParam *> (p))
                if(IsEqual(param->name, name))
                    return param->value;
        }
        return wstring();
    }
    /* Returns a modifiable reference to the first matching attribute's
     * value.
     *
     * When no attribute matches, a reference to an empty scratch string
     * is returned; writes to it do not affect the tag. The previous
     * implementation dereferenced NULL when the tag had no attributes
     * and otherwise returned the LAST attribute's value, so callers could
     * silently modify an unrelated attribute. Use HasParamCalled() first
     * when the distinction matters.
     */
    template<typename str>
    wstring &GetParamValue(const str &name)
    {
        for(iterator i=items.begin(); i!=items.end(); ++i)
        {
            Param *p = *i;
            if(ParamParam *param = dynamic_cast<ParamParam *> (p))
                if(IsEqual(param->name, name))
                    return param->value;
        }
        static wstring missing;
        CLEARSTR(missing);  /* discard anything a previous caller wrote */
        return missing;
    }
    /* Assigns value to every attribute whose name matches. */
    template<typename str>
    void ReplaceParam(const str &name, const wstring &value)
    {
        for(iterator i=items.begin(); i!=items.end(); ++i)
        {
            Param *p = *i;
            if(ParamParam *param = dynamic_cast<ParamParam *> (p))
                if(IsEqual(param->name, name))
                    param->value = value;
        }
    }
    
    /* Resets the tag to an empty, non-terminating state. */
    void clear()
    {
        CLEARSTR(Name);
        items.clear();
        terminating = false;
    }
    /* Case-insensitive tag name tests. */
    bool Is(const char *t) const { return IsEqual(Name, t); }
    bool Is(const wstring &s) const { return IsEqual(Name, s); }
};

/* An XML processing instruction. Content holds the raw text after the
 * name; the inherited item list holds the attributes parsed from it. */
struct Page::PI : public Page::Tag
{
    /* Rebuilds the item list from Content.
     *
     * Recognized syntax is  name="value"  separated by whitespace; only
     * double quotes delimit values. States:
     *   0  skipping whitespace before a name
     *   1  reading a name (up to '=')
     *   2  after '=' or after a closing quote
     *   3  inside a quoted value
     *
     * Whatever is still pending when Content ends is flushed according
     * to the final state. The previous code decided that from whether
     * 'param'/'value' were non-empty, which re-emitted the last attribute
     * when Content ended in whitespace, attached the previous attribute's
     * value to a trailing bare name, and turned name="" into a key.
     */
    void ParseParams()
    {
        wstring param;
        wstring value;
        
        int state=0;
        
        items.clear();
        
        for(unsigned a=0; a<Content.size(); ++a)
        {
            ucs4 c = Content[a];
            switch(state)
            {
                case 0:
                    if(c==' ' || c=='\t' || c == '\v' || c == '\r' || c == '\n')break;
                    CLEARSTR(param);
                    state=1; //passthru
                case 1:
                    if(c=='=') {CLEARSTR(value);state=2;break;}
                    param += c;
                    break;
                case 2:
                    if(c=='"') {state=3;break;}
                    // non-" is invalid: it ends the attribute and is dropped
                    SetParam(param, value);
                    state=0;
                    break;
                case 3:
                    if(c=='"') {state=2;break;}
                    value += c;
                    break;
            }
        }
        
        switch(state)
        {
            case 1:
                /* A name that never got an '=' */
                if(param.size()) SetKey(param);
                break;
            case 2:
            case 3:
                /* An attribute whose terminator was the end of Content */
                SetParam(param, value);
                break;
            default:
                /* state 0: everything has already been emitted */
                break;
        }
    }
public:
    wstring Content;
    PI() {}
    /* Creates a PI with the given name and parses con into attributes. */
    PI(const wstring &nam, const wstring &con) : Content(con)
    {
        Name = nam;
        ParseParams();
    }
    
    /* Regenerates Content from the item list (attributes and keys only,
     * separated by single spaces, values in double quotes) and then
     * reparses it so that the list and Content agree. */
    void Reconstruct()
    {
        CLEARSTR(Content);
        
        const_iterator i;
        for(i=items.begin(); i!=items.end(); ++i)
        {
            const Tag::Param *p = *i;
            if(const Tag::ParamParam *param = dynamic_cast<const Tag::ParamParam *> (p))
            {
                if(Content.size())Content += ' ';
                Content += param->name;
                Content += "=\"";
                Content += param->value;
                Content += '"';
            }
            else if(const Tag::ParamKey *key = dynamic_cast<const Tag::ParamKey *> (p))
            {
                if(Content.size())Content += ' ';
                Content += key->name;
            }
        }
        ParseParams();
    }
};

/* Document element holding body text.
 * Page::Dump() writes it through DumpHTML(); Page::FilterText() passes
 * it through the filter callback. */
class Page::ElemBody : public Page::Element
{
public:
    wstring body;
    ElemBody(const wstring &b) : body(b)
    {
    }
};

/* Document element holding a copy of a parsed tag.
 * Page::Dump() writes it through DumpTag(). */
class Page::ElemTag : public Page::Element
{
public:
    struct Tag tag;
    ElemTag(const struct Tag &t) : tag(t)
    {
    }
};

/* Document element holding a copy of a processing instruction.
 * Page::Dump() writes it through DumpPI(). */
class Page::ElemPI : public Page::Element
{
    // XML processing information (<? ?>)
public:
    struct PI pi;
    ElemPI(const struct PI &p) : pi(p)
    {
    }
};

/* Document element holding raw data.
 * Page::Dump() writes it through DumpRaw(), which outputs the data
 * without entity encoding; Page::FilterText() leaves it untouched. */
class Page::ElemRaw : public Page::Element
{
public:
    wstring data;
    ElemRaw(const wstring &b) : data(b)
    {
    }
};

/* Writes a tag: '<', the name, every item in list order, then an
 * optional '?' (when the name starts with '?'), an optional " /" (when
 * the tag is terminating) and '>'.
 *
 * Attribute values are quoted when they are empty, contain anything
 * other than [A-Za-z0-9-._:], or when xmlmode is on. The quote character
 * is '"' in xmlmode; otherwise whichever of '"' and '\'' occurs less
 * often in the value, so fewer characters need escaping.
 */
void Page::DumpTag(const Tag &tag) const
{
    Putc('<');
    Dump(tag.Name);
    
    Tag::const_iterator i;

    for(i=tag.items.begin(); i!=tag.items.end(); ++i)
    {
        const Tag::Param *p = *i;
        if(const Tag::ParamParam *param = dynamic_cast<const Tag::ParamParam *> (p))
        {
            Dump(param->name);
            Putc('=');
            const wstring &s = param->value;
            bool needquotes = false;
            
            unsigned countq2 = 0;   /* number of '"' in the value */
            unsigned countq1 = 0;   /* number of '\'' in the value */
            
            /* Scan the WHOLE value. The scan used to stop at the first
             * character requiring quotes, so the two counters never saw
             * more than that one character and the quote-type choice
             * below was effectively arbitrary. */
            for(unsigned a=0; a<s.size(); ++a)
            {
                if(s[a] >= 'A' && s[a] <= 'Z')continue;
                if(s[a] >= 'a' && s[a] <= 'z')continue;
                if(s[a] >= '0' && s[a] <= '9')continue;
                if(s[a] == '-' || s[a] == '.'
                || s[a] == '_' || s[a] == ':')continue;
                if(s[a] == '"') ++countq2;
                if(s[a] == '\'') ++countq1;
                needquotes = true;
            }
            
            if(!s.size()) needquotes = true;
            
            if(needquotes || xmlmode)
            {
                char quotetype = (countq2 <= countq1 || xmlmode) ? '"' : '\'';
                Putc(quotetype);
                DumpHTML(s, quotetype);
                Putc(quotetype);
            }
            else
                DumpHTML(s);
        }
        else if(const Tag::ParamKey *key = dynamic_cast<const Tag::ParamKey *> (p))
        {
            Dump(key->name);
        }
        else if(const Tag::ParamComm *comm = dynamic_cast<const Tag::ParamComm *> (p))
        {
            Putc('-'); Putc('-');
            Dump(comm->data);
            Putc('-'); Putc('-');
        }
        else if(const Tag::ParamSpace *spc = dynamic_cast<const Tag::ParamSpace *> (p))
        {
            Dump(spc->data);
        }
    }

    if(tag.Name.size() > 0
    && tag.Name[0] == '?')
    {
        Putc('?');
    }
    if(tag.terminating)Putc(' '),Putc('/');
    Putc('>');
}

/* Writes raw content verbatim. If its first non-whitespace text is not
 * "<!--", the content is wrapped in an HTML comment (on separate lines)
 * and FixedStyleScript is incremented. Content consisting only of
 * whitespace is written unwrapped. */
void Page::DumpRaw(const wstring &data) const
{
    /* Locate the first character that is not a space, LF, tab or CR */
    unsigned first = 0;
    while(first < data.size()
       && (data[first] == ' '  || data[first] == '\n'
        || data[first] == '\t' || data[first] == '\r'))
    {
        ++first;
    }

    const bool wrap = first < data.size()
                   && !IsEqual(data.substr(first, 4), "<!--");
    if(!wrap)
    {
        Dump(data);
        return;
    }

    ++FixedStyleScript;

    Putc('<');
    Putc('!');
    Putc('-');
    Putc('-');
    Putc('\n');
    Dump(data);
    Putc('\n');
    Putc('-');
    Putc('-');
    Putc('>');
}

/* Writes a processing instruction as '<' + name + ' ' + content + "?>".
 * Only pi.Content is written; the parsed item list is not consulted
 * (see PI::Reconstruct() for regenerating Content from the items). */
void Page::DumpPI(const PI &pi) const
{
    Putc('<');
    Dump(pi.Name);
    Putc(' ');
    Dump(pi.Content);
    Putc('?');
    Putc('>');
    /* Debug - don't use
    Tag::const_iterator i;
    for(i=pi.items.begin(); i!=pi.items.end(); ++i)
    {
        const Tag::Param *p = *i;
        if(const Tag::ParamParam *param = dynamic_cast<const Tag::ParamParam *> (p))
        {
            Putc('[');
            Dump(param->name);
            Putc('=');
            Dump(param->value);
            Putc(']');
        }
        else if(const Tag::ParamKey *key = dynamic_cast<const Tag::ParamKey *> (p))
        {
            Putc('[');
            Dump(key->name);
            Putc(']');
        }
    }
    */
}

/* Converts s with the 'converter' descriptor and writes the result to
 * stdout.
 *
 * Error handling per iconv() failure:
 *   E2BIG   output buffer full: what was converted is already written,
 *           continue with the rest
 *   EILSEQ  character not representable: write '?' and skip one unit
 *   EINVAL  truncated input: write '?' and stop
 *   other   stop (nothing sensible can be done; looping would never end)
 *
 * errno is captured immediately after iconv() because the fwrite() that
 * follows is allowed to change it.
 */
void Page::Dumper::puts(const wstring &s) const
{
    char *input = (char *) (const_cast<ucs4 *> (s.data()));
    size_t left = s.size() * sizeof(ucs4);
    while(left > 0)
    {
        char OutBuf[4096], *outptr = OutBuf;
        size_t outsize = sizeof OutBuf;
    #if DEBUG
        fprintf(stderr, "P1:Converting %u bytes to %u bytes space\n", left, outsize);
        size_t bytesread = left, converted = outsize;
    #endif
        size_t retval = iconv(converter, &input, &left, &outptr, &outsize);
        const int iconv_errno = errno;
    #if DEBUG
        bytesread -= left; converted -= outsize;
        fprintf(stderr, "%u bytes read, %u bytes left, %u bytes generated, %u bytes  space left\n", bytesread, left, converted, outsize);
    #endif
        /* Flush whatever was converted, even when iconv() stopped early */
        fwrite(OutBuf, 1, outptr-OutBuf, stdout);
        if(retval != (size_t)-1)
            continue;
        
#if DEBUG
        errno = iconv_errno;
        perror("iconv");
#endif
        if(iconv_errno == E2BIG)
        {
            /* Just needs another round with an empty output buffer */
            continue;
        }
        if(iconv_errno == EILSEQ)
        {
            if(left < sizeof(ucs4)) return;
            input += sizeof(ucs4);
            left -= sizeof(ucs4);
            putchar('?');
            continue;
        }
        if(iconv_errno == EINVAL)
        {
            /* Got partial byte and the sequence terminates after that */
            putchar('?');
            return;
        }
        /* Unexpected failure: retrying would make no progress */
        return;
    }
}


/* Switches the output character set: closes both iconv descriptors and
 * reopens them for conversion from the intermediate charset to setname.
 * Prints a progress line when verbose >= 1. Terminates the program with
 * EINVAL if a descriptor cannot be opened. */
void Page::Dumper::SetSet(const char *setname)
{
    if(verbose >= 1)
    {
        fprintf(stderr, "Recoding %s to output (%s)\n", midset, setname);
        fflush(stderr);
    }

    /* Drop the descriptors for the previous charset */
    iconv_close(converter);
    iconv_close(tester);

    charset = setname;

    /* Short-circuit: the tester is only opened when the converter succeeded */
    const bool failed = OpenConv(converter, setname, midset)
                     || OpenConv(tester,    setname, midset);
    if(!failed)
        return;

    exit(EINVAL);
}


wstring Page::htmlencode(const wstring &s) const
{
    wstring res;
    unsigned a, b;
    for(a=b=0; a<s.size(); ++a)
    {
        if(s[a]=='<' && !underquote) /* Must be encoded: Could start a tag otherwise. */
        {
            if(a>b)res += s.substr(b, a-b);
            res += "&lt;";
            b = a+1; continue;
        }
        if(s[a]=='"' && underquote == '"') /* Must be encoded: Could end a parameter otherwise. */
        {
            if(a>b)res += s.substr(b, a-b);
            res += "&quot;";
            b = a+1; continue;
        }
        if(s[a]=='\'' && underquote == '\'') /* Must be encoded: Could end a parameter otherwise. */
        {
            if(a>b)res += s.substr(b, a-b);
            res += "&#39;";
            b = a+1; continue;
        }
        if(s[a]=='&') /* Must be encoded: Could start an entity otherwise. */
        {
            if(a>b)res += s.substr(b, a-b);
            res += "&amp;";
            b = a+1; continue;
        }
        if(s[a]==160) /* Not necessary, but nice: &nbsp; is commonly known. */
        {
            /* FIXME: invent a better condition here */
            if(!strict)
            {
                if(a>b)res += s.substr(b, a-b);
                res += "&nbsp;";
                b = a+1; continue;
            }
        }
        if(lossless && !CanDump(s[a]))
        {
            char Buf[64];
            if(usehex)
                sprintf(Buf, "&#x%X;", s[a]);
            else
                sprintf(Buf, "&#%u;", s[a]);
            if(a>b)res += s.substr(b, a-b);
            res += Buf;
            b = a+1; continue;
        }
    }
    if(a>b)res += s.substr(b, a-b);
    return res;
}

/* Returns s with character references replaced by the characters they
 * denote.
 *
 * Recognized forms (the trailing ';' is optional in all of them):
 *   &#NNN;   decimal reference, at least one digit
 *   &#xHHH;  hexadecimal reference, at least one digit
 *   &name;   named entity known to FindEntity()
 * Anything else following '&' is copied through unchanged.
 *
 * Fixes over the previous implementation:
 *  - a reference that ended the string without ';' left the "copied so
 *    far" marker behind, so the preceding text and the raw reference
 *    were appended a second time;
 *  - "&#" / "&#x" without any digits was decoded to U+0000.
 */
wstring Page::htmldecode(const wstring &s) const
{
    wstring res;
    size_t a, b;    /* a = scan position, b = start of the uncopied run */
    for(a=b=0; a<s.size(); )
    {
        if(s[a] != '&')
        {
            ++a;
            continue;
        }
        
        ucs4 specialchar = 0;
        size_t c = a+1;         /* ends up just past the reference body */
        bool recognized = false;
        
        if(c < s.size() && s[c] == '#')
        {
            size_t digits = 0;
            ++c;
            if(c < s.size() && s[c] == 'x')
            {
                for(++c; c < s.size(); ++c, ++digits)
                {
                    if(s[c] >= '0' && s[c] <= '9') specialchar = specialchar*16+ (s[c]-'0');
                    else if(s[c] >= 'A' && s[c] <= 'F') specialchar = specialchar*16+ (s[c]-'A'+10);
                    else if(s[c] >= 'a' && s[c] <= 'f') specialchar = specialchar*16+ (s[c]-'a'+10);
                    else break;
                }
            }
            else
            {
                for(; c < s.size(); ++c, ++digits)
                {
                    if(s[c] >= '0' && s[c] <= '9') specialchar = specialchar*10+ (s[c]-'0');
                    else break;
                }
            }
            recognized = digits > 0;
        }
        else
        {
            /* Entity names: letters and '.', digits allowed after the
             * first character */
            size_t e = 0;
            for(; (c < s.size() && s[c]!=';'); ++e, ++c)
            {
                if(s[c] >= '0' && s[c] <= '9') { if(!e)break; continue; }
                if(s[c] >= 'A' && s[c] <= 'Z')continue;
                if(s[c] >= 'a' && s[c] <= 'z')continue;
                if(s[c] == '.')continue;
                break;
            }
            specialchar = FindEntity(s.substr(a+1, e));
            recognized = specialchar != ilseq;
        }
        
        if(!recognized)
        {
            /* Unrecognized &..; -code. Don't parse it. */
            ++a;
            continue;
        }
        
        res += s.substr(b, a-b);
        res += specialchar;
        a = c;
        /* The terminating ';' is consumed when present; a reference
         * without one is accepted as it stands. */
        if(a < s.size() && s[a] == ';')
            ++a;
        b = a;
    }
    if(a > b) res += s.substr(b, a-b);
    return res;
}


void Page::Dump() const
{
    if(signature)
    {
        if(Dumper.isok(ucsig))
            Putc(ucsig);
        else
        {
            fprintf(stderr,
                "Warning: Target encoding can't express unicode signature character. Not signing.\n");
        }
    }
    
    for(unsigned a=0; a<Structure.size(); ++a)
    {
        const Element *e = Structure[a];
        if(const ElemBody *elem = dynamic_cast<const ElemBody *> (e))
        {
            DumpHTML(elem->body);
        }
        else if(const ElemTag *elem = dynamic_cast<const ElemTag *> (e))
        {
            DumpTag(elem->tag);
        }
        else if(const ElemRaw *elem = dynamic_cast<const ElemRaw *> (e))
        {
            DumpRaw(elem->data);
        }
        else if(const ElemPI *elem = dynamic_cast<const ElemPI *> (e))
        {
            DumpPI(elem->pi);
        }
    }
}

/* Selects the output character set and rewrites the document's own
 * charset declarations to match it:
 *   - <META HTTP-EQUIV="Content-Type" CONTENT="...charset=X">: everything
 *     after "charset=" in CONTENT is replaced with outset;
 *   - <?xml ... encoding="X"?>: the encoding value is replaced and the
 *     PI content regenerated.
 * Declarations that are absent are not added.
 */
void Page::SetOut(const char *outset)
{
    Dumper.SetSet(outset);

    for(unsigned a=0; a<Structure.size(); ++a)
    {
        Element *e = Structure[a];
        if(ElemTag *elem = dynamic_cast<ElemTag *> (e))
        {
            struct Tag &tag = elem->tag;
            if(!tag.Is("META"))
            {
                /* We are only interested in meta-tags. */
                continue;
            }

            if(!tag.HasParamCalled("HTTP-EQUIV")
            || !IsEqual(tag.GetParamValue("HTTP-EQUIV"), "CONTENT-TYPE")) continue;
            
            /* The mutable GetParamValue() has no way to report a missing
             * attribute, so make sure CONTENT exists before asking for a
             * reference that will be written through. */
            if(!tag.HasParamCalled("CONTENT")) continue;
            
            wstring &s = tag.GetParamValue("CONTENT");
               
            wstring tmp; tmp += "charset=";
            size_t pos = s.find(tmp);
            if(pos == s.npos) { continue; }
            pos += 8;   /* length of "charset=" */
            
            /* Keep everything up to and including "charset=" */
            tmp = s.substr(0, pos);
            tmp += outset;
            s = tmp;
        }
        else if(ElemPI *elem = dynamic_cast<ElemPI *> (e))
        {
            struct PI &pi = elem->pi;
            if(pi.Is("?XML"))
            {
                if(!pi.HasParamCalled("ENCODING")) continue;
                
                wstring tmp; tmp += outset;
                pi.ReplaceParam("ENCODING", tmp);
                
                /* Bring pi.Content in line with the modified attribute */
                pi.Reconstruct();
            }
        }
    }
}

void Page::FilterText(wstring (*proc)(const wstring &))
{
    for(unsigned a=0; a<Structure.size(); ++a)
    {
        Element *e = Structure[a];
        if(ElemBody *elem = dynamic_cast<ElemBody *> (e))
        {
            wstring &body = elem->body;
            body = proc(body);
        }
        else if(ElemTag *elem = dynamic_cast<ElemTag *> (e))
        {
            struct Tag &tag = elem->tag;
            if(tag.Is("IMG"))
            {
                Tag::iterator i;
                for(i=tag.items.begin(); i!=tag.items.end(); ++i)
                {
                    Tag::Param *p = *i;
                    if(Tag::ParamParam *param = dynamic_cast<Tag::ParamParam *> (p))
                        if(IsEqual(param->name, "ALT"))
                            param->value = proc(param->value);
                }
            }
            else if(tag.Is("A"))
            {
                Tag::iterator i;
                for(i=tag.items.begin(); i!=tag.items.end(); ++i)
                {
                    Tag::Param *p = *i;
                    if(Tag::ParamParam *param = dynamic_cast<Tag::ParamParam *> (p))
                        if(IsEqual(param->name, "TITLE"))
                            param->value = proc(param->value);
                }
            }
        }
        else
        {
            // Nothing interesting in ElemRaw
            // Are there other types of elements?
        }
    }
}
    
/* Child-process worker: reads the document from fp in charset 'inset',
 * converts it to the intermediate charset and writes the result to fd.
 *
 * Byte sequences that are invalid in 'inset' (or incomplete at end of
 * input) are replaced by ilseq and skipped one byte at a time.
 * Exits the process with status 1 if the converter cannot be opened;
 * otherwise returns once the input is exhausted or fd stops accepting
 * data.
 *
 * errno is captured right after iconv(): the write() that follows may
 * change it before the error is classified.
 */
static void ParseInConv(FILE *fp, const char *inset, int fd)
{
    /* Deallocate all possible unnecessary resources: close stdin/stdout
     * unless one of them is our input or our output. */
    if(fileno(fp) != 0 && fd != 0) close(0);
    if(fileno(fp) != 1 && fd != 1) close(1);
    chdir("/");
    //signal(SIGHUP, _exit); - probably unsafe and redundant
    
    iconv_t converter = iconv_open(midset, inset);
    if(converter == (iconv_t)(-1))
    {
        perror("iconv_open");
        _exit(1);
    }
    
    char InBuf[4096];
    char OutBuf[4096];
    
    char *bufptr = InBuf;   /* next unconverted input byte */
    size_t bytes = 0;       /* unconverted bytes available at bufptr */
    
    for(;;)
    {
        /* Top up the input buffer behind what is still unconverted */
        size_t code = fread(bufptr+bytes, 1, sizeof InBuf - bytes, fp);
        if(code == 0)
        {
            if(!bytes)break;
        }
        else
            bytes += code;

    ReCode:
        char *outptr = OutBuf;
        size_t outsize = sizeof OutBuf;
        
        bool needspace = false;
        bool gotilseq = false;
    #if DEBUG
        fprintf(stderr, "P2:Converting %u bytes to %u bytes space\n", bytes, outsize);
        size_t bytesread = bytes;
    #endif
        size_t converted = outsize;
        size_t retval = iconv(converter, &bufptr, &bytes, &outptr, &outsize);
        const int iconv_errno = errno;
        converted -= outsize;
    #if DEBUG
        bytesread -= bytes;
    #endif
        if(converted > 0 && write(fd, OutBuf, converted) < 0)
        {
            /* The reader is gone; converting further is pointless */
            break;
        }
        
        if(retval == (size_t)-1)
        {
    #if DEBUG
            errno = iconv_errno;
            perror("iconv");
    #endif
            if(iconv_errno == E2BIG)
            {
                needspace = true;
            }
            if(iconv_errno == EILSEQ)
            {
                gotilseq = true;
            }
            if(iconv_errno == EINVAL)
            {
                /* Got partial byte and the sequence terminates after that */
                if(code == 0)
                {
                    /* It's an error if we're at eof */
                    gotilseq = true;
                }
            }
        }
    #if DEBUG
        fprintf(stderr, "%u bytes read, %u bytes left, %u bytes generated, %u bytes space left\n", bytesread, bytes, converted, outsize);
        fflush(stderr);
    #endif
        if(gotilseq)
        {
            if(write(fd, &ilseq, sizeof(ilseq)) < 0)
                break;
            /* Skip the invalid byte */
            --bytes; ++bufptr;
        }
        
        if(needspace)
        {
            /* No need to retry reading, just want more space */
            goto ReCode;
        }
        
        /* Move the unconverted tail to the front so fread can append */
        memmove(&InBuf[0], bufptr, bytes);
        bufptr = InBuf;
    }
    
    iconv_close(converter);
}

/* Reads and parses a document from fp, assuming it is encoded in
 * 'charset'.
 *
 * A child process converts the input to the intermediate charset and
 * feeds it through a pipe to ParseUCS4(). Afterwards the charset the
 * document declares about itself (META http-equiv content-type, or the
 * encoding of an <?xml?> PI) is compared with the one used; if it
 * differs, the input is rewound and parsed again with the declared
 * charset. Conflicting declarations, or input that cannot be rewound,
 * are reported on stderr and the function returns with what was parsed.
 *
 * Failures of pipe(), fork() and fdopen() are reported and abort the
 * parse. In particular a failed fork() must never reach kill(): with
 * pid == -1 the signal would go to every process we may signal.
 */
void Page::Parse(FILE *fp, const char *charset)
{
    string inset = charset;
ReHandle:
    if(verbose >= 1)
    {
        fprintf(stderr, "Recoding input (%s) to %s\n", inset.c_str(), midset);
        fflush(stderr);
    }
    int pip[2];
    if(pipe(pip) == -1)
    {
        perror("pipe");
        return;
    }
    const pid_t pid = fork();
    if(pid == (pid_t)-1)
    {
        perror("fork");
        close(pip[0]);
        close(pip[1]);
        return;
    }
    if(!pid)
    {
        /* Child: convert the input into the pipe, then leave without
         * running any of the parent's cleanup. */
        close(pip[0]);
        ParseInConv(fp, inset.c_str(), pip[1]);
        _exit(0);
    }
    close(pip[1]);
    FILE *pipfp = fdopen(pip[0], "rb");
    if(!pipfp)
    {
        perror("fdopen");
        close(pip[0]);
        kill(pid, SIGHUP);
        waitpid(pid, NULL, 0);
        return;
    }
    
    Structure.clear();
    ParseUCS4(pipfp);
    
    fclose(pipfp);
    kill(pid, SIGHUP);
    waitpid(pid, NULL, 0);

    /* Collect every charset the document claims for itself (uppercased) */
    set<wstring> newcharset;
    for(unsigned a=0; a<Structure.size(); ++a)
    {
        const Element *e = Structure[a];
        if(const ElemTag *elem = dynamic_cast<const ElemTag *> (e))
        {
            const struct Tag &tag = elem->tag;
            if(!tag.Is("META"))
            {
                /* We are only interested in meta-tags. */
                continue;
            }
            
            if(!tag.HasParamCalled("HTTP-EQUIV")
            || !IsEqual(tag.GetParamValue("HTTP-EQUIV"), "CONTENT-TYPE")) continue;
               
            wstring s = tag.GetParamValue("CONTENT");

            wstring tmp; tmp += "charset=";
            size_t pos = s.find(tmp);
            if(pos == s.npos) { continue; }
            
            /* Everything after "charset=" is taken as the charset name */
            wstring way = s.substr(pos+8);
            for(size_t n=0; n<way.size(); ++n)
                if(way[n] < 0x100)
                    way[n] = toupper(way[n]);
            newcharset.insert(way);
        }
        else if(const ElemPI *elem = dynamic_cast<const ElemPI *> (e))
        {
            const struct PI &pi = elem->pi;
            if(pi.Is("?XML"))
            {
                if(!pi.HasParamCalled("ENCODING")) continue;
                
                wstring way = pi.GetParamValue("ENCODING");
                for(size_t n=0; n<way.size(); ++n)
                    if(way[n] < 0x100)
                        way[n] = toupper(way[n]);
                newcharset.insert(way);
            }
        }
    }
    
    if(newcharset.size() > 0)
    {
        if(newcharset.size() > 1)
        {
            fprintf(stderr, "Error: The document is schizophrenic and claims to be encoded in various ways:");
            set<wstring>::const_iterator i;
            size_t c=newcharset.size();
            for(i=newcharset.begin(); i!=newcharset.end(); ++i)
            {
                string s = Stringify(*i);
                /* Separator: "," between items, " and" before the last
                 * one, ".\n" after it */
                fprintf(stderr, " %s%s",
                    s.c_str(),
                    (--c==1 ? " and" : c>1 ? "," : ".\n"));
            }
            return;
        }
        wstring newset = *newcharset.begin();
        
        if(!IsEqual(newset, inset.c_str()))
        {
            inset = Stringify(newset);
            
            if(verbose >= 0)
            {
                fprintf(stderr,
                    "Warning: Document character encoding seems to be %s, which differs from what you specified. Rereading.\n",
                    inset.c_str());
            }
            
            if(fseek(fp, 0, SEEK_SET) == -1)
            {
                if(errno == ESPIPE
                || errno == EBADF)
                {
                    fprintf(stderr, "Error: stdin is not seekable. Can not reread! You should use the -I%s option. Or perhaps you committed a Useless Use Of Cat.\n",
                        inset.c_str());
                    return;
                }
                else
                    perror("fseek");
            }
           
            goto ReHandle;
        }
    }
}

/* Page::ParseUCS4
 *
 * Reads characters from fp one at a time with Getc() until feof(fp) is set
 * and feeds them through a hand-written state machine.  Recognized pieces
 * are appended to this page with push_back():
 *   - ElemBody : text between tags, passed through htmldecode()
 *   - ElemTag  : a tag, with its keys, params, spacing and comments
 *                collected in a Tag object
 *   - ElemPI   : a "<?name content?>" processing instruction
 *                (seeing one also sets xmlmode)
 *   - ElemRaw  : the content following a SCRIPT or STYLE tag (only when
 *                !strict), up to the matching end tag
 *
 * A ucsig character at the very start of the input is not parsed; it only
 * sets `signature`.
 *
 * Many states hand the current character over to another state with
 * "goto Statechange" instead of "break": the character is then dispatched
 * again under the new state without reading a new one.
 *
 * At end of input, pending body text and raw content are flushed; a tag
 * that was still being parsed is dropped.
 */
void Page::ParseUCS4(FILE *fp)
{
    // Accumulators for the three kinds of free-form content.
    wstring body, comment, rawcontent;
    
    enum states
    {
        stBody,                 // outside of tags, collecting text into body
        stTagName,              // right after '<', collecting Tag.Name
        stTagSpace,             // inside a tag, between its components
        stTagSGML,              // inside a "<!...>" tag, unquoted content
        stTagSGMLquoted,        // inside a "<!...>" tag, within "..."
        stTagSGMLquoted2,       // inside a "<!...>" tag, within '...'
        stTagParam,             // collecting a parameter name
        stTagParamValue,        // after '=', unquoted value or opening quote
        stTagParamValueQuoted,  // parameter value within "..."
        stTagParamValueQuoted2, // parameter value within '...'
#if 1
        stRawContent,           // collecting raw SCRIPT/STYLE content
        stMaybeEndRawContent,   // saw '<' within raw content
        stRawContentTagName,    // saw "</" within raw content, reading a name
#endif
        stMaybeComment,         // saw a single '-' inside a tag
        stComment,              // inside a "--" comment within a tag
        stMaybeEndComment,      // saw a single '-' inside a comment
        stXMLpiname,            // after "<?", collecting the PI name
        stXMLpi,                // collecting the PI content
        stXMLpi2                // saw '?' within PI content
    } state = stBody;
    
    wstring ParamSpace;    // filler characters seen in stTagSpace, pending
    wstring ParamName;     // current param name / SGML content / PI name
    wstring ParamValue;    // current param value (still html-encoded)
    wstring RawCurrentTag; // "/" + name of the tag that opened raw content
    wstring ParamContent;  // content of the current processing instruction
    Tag Tag;               // the tag currently being assembled
    
    bool firstbyte = true;
    for(;;)
    {
        int c = Getc(fp);
        if(feof(fp))break;
        
        // A signature at the start of input is consumed, not parsed.
        // firstbyte is left true here, so a ucsig that immediately follows
        // is consumed the same way.
        if(firstbyte && c == ucsig)
        {
            if(!signature)
                fprintf(stderr,
                    "Warning: Found an unicode signature. Will put one to the output too.\n");
            signature = true;
            continue;
        }
        
        firstbyte = false;
        
Statechange:
        // Whenever we are no longer in stTagSpace, hand the filler
        // collected there over to the tag.
        if(ParamSpace.size() && state != stTagSpace)
        {
            Tag.AddSpace(ParamSpace);
            CLEARSTR(ParamSpace);
        }
        //fprintf(stderr, "State=%d, c=%c\n", state, c);
        switch(state)
        {
            case stBody:
                if(c != '<')
                {
                    body += c;
                    break;
                }
                // '<' starts a tag: emit the text collected so far.
                if(body.size())
                {
                    push_back(new ElemBody(htmldecode(body)));
                    CLEARSTR(body);
                }
                
                Tag.clear();
                state = stTagName;
                break;

            case stTagName:
                // "<?" introduces a processing instruction.
                if(c == '?' && !Tag.Name.size())
                {
                    state = stXMLpiname;
                    CLEARSTR(ParamName);
                    goto Statechange;
                }
                // Note: this code is almost duplicated in stRawContentTagName
                // '!' and '/' are accepted only as the first character,
                // digits only after the first character.
                if((c == '!' && !Tag.Name.size())
                || (c == '/' && !Tag.Name.size())
                || (c >= 'A' && c <= 'Z')
                || (c >= 'a' && c <= 'z')
                || (c >= '0' && c <= '9' && Tag.Name.size())
                || c == '_'
                || c == '.'
                || c == ':' /* no '-' here: it would break comment detection */
                  )
                {
                    Tag.Name += c;
                    break;
                }
                // Any other character ends the name; let stTagSpace handle it.
                CLEARSTR(ParamName);
                state = stTagSpace;
                goto Statechange;
                
            case stTagSpace:
                // A '-' may be the first half of a "--" comment delimiter.
                if(c == '-')
                {
                    state = stMaybeComment;
                    break;
                }
                if(c == '>')
                {
                    // End of tag: flush what is pending and emit the tag.
                    if(ParamSpace.size())
                    {
                        Tag.AddSpace(ParamSpace);
                        CLEARSTR(ParamSpace);
                    }
                    if(ParamName.size())
                    {
                        Tag.SetKey(ParamName);
                        CLEARSTR(ParamName);
                    }
                    push_back(new ElemTag(Tag));
                    
                    state = stBody;
                    
                    // In non-strict mode, what follows SCRIPT and STYLE is
                    // read as raw content until "</" + the same tag name.
                    if(!strict)
                    {
                        if(Tag.Is("SCRIPT")
                        || Tag.Is("STYLE"))
                        {
                            RawCurrentTag = Tag.Name;
                            RawCurrentTag.insert(0, 1, (ucs4)'/');
                            state = stRawContent;
                        }
                    }

                    Tag.clear();
                    
                    CLEARSTR(body);
                    break;
                }
                // Inside a "<!...>" tag every non-whitespace character is
                // SGML content; pieces already collected in ParamName are
                // separated from the next one by a single space.
                if(Tag.Name.size() && Tag.Name[0] == '!')
                {
                    if(c != ' ' && c != '\t' && c != '\v' && c != '\r' && c != '\n')
                    {
                        if(ParamName.size())
                            ParamName += ' ';
                        state = stTagSGML;
                        goto Statechange;
                    }
                }
                
                // A '/' here marks the tag as terminating; the filler
                // collected so far is discarded.
                if(c == '/')
                {
                    CLEARSTR(ParamSpace);
                    Tag.terminating = true;
                    break;
                }
                
                // A letter starts a parameter name.
                if((c >= 'A' && c <= 'Z')
                || (c >= 'a' && c <= 'z'))
                {
                    state = stTagParam;
                    goto Statechange;
                }
                
                // Everything else is kept as filler.
                ParamSpace += c;
                break;
            
            case stTagSGML:
                if(c == '-')
                {
                    state = stMaybeComment;
                    break;
                }
                // '>' is handled by stTagSpace, which emits the tag.
                if(c == '>')
                {
                    state = stTagSpace;
                    goto Statechange;
                }
                // The quote characters themselves are part of the content.
                ParamName += c;
                if(c == '"')
                    state = stTagSGMLquoted;
                else if(c == '\'')
                    state = stTagSGMLquoted2;
                break;
            
            case stTagSGMLquoted:
                ParamName += c;
                if(c == '"')
                    state = stTagSGML;
                break;

            case stTagSGMLquoted2:
                ParamName += c;
                if(c == '\'')
                    state = stTagSGML;
                break;

            case stTagParam:
                // Note that digits are not accepted in parameter names.
                if((c >= 'A' && c <= 'Z')
                || (c >= 'a' && c <= 'z')
                || c == ':'
                || c == '.'
                || c == '_'
                || c == '-'
                  )
                {
                    ParamName += c;
                    break;
                }

                if(c == '=')
                {
                    CLEARSTR(ParamValue);
                    state = stTagParamValue;
                    break;
                }

                // Name ended without '=': it is a key without a value.
                Tag.SetKey(ParamName);
                CLEARSTR(ParamName);
                state = stTagSpace;
                goto Statechange;
            
            case stTagParamValue:
                // A quote only opens a quoted value when it is the very
                // first character after '='.
                if(!ParamValue.size())
                {
                    if(c == '"')
                    {
                        state = stTagParamValueQuoted;
                        break;
                    }
                    if(c == '\'')
                    {
                        state = stTagParamValueQuoted2;
                        break;
                    }
                }
                
                if((c >= 'A' && c <= 'Z')
                || (c >= 'a' && c <= 'z')
                || (c >= '0' && c <= '9')
                || c == '.' || c == '-' /* .-_: are allowed by sgml */
                || c == '_' || c == ':')
                {
                    ParamValue += c;
                    break;
                }
                // Non-strict mode tolerates any other character except
                // whitespace and '>' in an unquoted value, and counts it.
                else if(!strict
                    && (c!=' ' && c!='\t' && c!='\n' && c!='\r' && c!='\v' && c!='>')
                       ) /* generated by many programs :-/ */
                {
                    ParamViolations++;
                    ParamValue += c;
                    break;
                }
                // Value ended: store it and let stTagSpace handle c.
                Tag.SetParam(ParamName, htmldecode(ParamValue));
                CLEARSTR(ParamName);
                CLEARSTR(ParamValue);
                state = stTagSpace;
                goto Statechange;
            
            case stTagParamValueQuoted:
                // The closing quote is consumed and not stored.
                if(c == '"')
                {
                    Tag.SetParam(ParamName, htmldecode(ParamValue));
                    CLEARSTR(ParamName);
                    CLEARSTR(ParamValue);
                    state = stTagSpace;
                    break;
                }
                ParamValue += c;
                break;

            case stTagParamValueQuoted2:
                // The closing quote is consumed and not stored.
                if(c == '\'')
                {
                    Tag.SetParam(ParamName, htmldecode(ParamValue));
                    CLEARSTR(ParamName);
                    CLEARSTR(ParamValue);
                    state = stTagSpace;
                    break;
                }
                ParamValue += c;
                break;

            case stMaybeComment:
                // Second '-' in a row: a comment begins.
                if(c == '-')
                {
                    state = stComment;
                    CLEARSTR(comment);
                    break;
                }
                
                // It was a lone '-': keep it and reprocess c as tag content.
                ParamName += '-';
                
                state = stTagSpace;
                goto Statechange;
            
            case stComment:
                if(c == '-')
                {
                    state = stMaybeEndComment;
                    break;
                }
                comment += c;
                break;
            
            case stMaybeEndComment:
                // "--" ends the comment; an empty comment is not recorded.
                if(c == '-')
                {
                    if(comment.size())
                    {
                        Tag.AddComment(comment);
                        CLEARSTR(comment);
                    }
                    
                    state = stTagSpace;
                    break;
                }
                // The previous '-' was part of the comment text after all.
                comment += '-';
                state = stComment;
                goto Statechange;
#if 1
            case stRawContent:
                // end with </
                if(c == '<')
                {
                    state = stMaybeEndRawContent;
                    break;
                }
                rawcontent += c;
                break;
            
            case stXMLpiname:
                // The PI name runs up to the first whitespace character.
                if(c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\v')
                {
                    CLEARSTR(ParamContent);
                    state = stXMLpi;
                    break;
                }
                ParamName += c;
                break;

            case stXMLpi:
                // A '?' may be the first half of the closing "?>".
                if(c == '?')
                {
                    state = stXMLpi2;
                    break;
                }
                ParamContent += c;
                break;
                
            case stXMLpi2:
                if(c != '>')
                {
                    // The '?' was ordinary content; reprocess c in stXMLpi.
                    ParamContent += '?';
                    // not end of tag
                    state = stXMLpi;
                    goto Statechange;
                }
                
                push_back(new ElemPI(PI(ParamName, ParamContent)));
                
                Tag.clear();
                CLEARSTR(ParamContent);
                
                // tag ends here
                xmlmode = true;
                state = stBody;
                break;

            case stMaybeEndRawContent:
                // "</" might start the end tag; the '/' itself is passed on
                // and becomes the first character of Tag.Name.
                if(c == '/')
                {
                    CLEARSTR(Tag.Name);
                    state = stRawContentTagName;
                    goto Statechange;
                }
                // Just a '<' inside the raw content.
                rawcontent += '<';
                state = stRawContent;
                goto Statechange;
            
            case stRawContentTagName:
                // Note: this code is duplicate from stTagName
                if((c == '!' && !Tag.Name.size())
                || (c == '/' && !Tag.Name.size())
                || (c >= 'A' && c <= 'Z')
                || (c >= 'a' && c <= 'z')
                || (c >= '0' && c <= '9' && Tag.Name.size())
                || (c == '_' || c == '.'
                 || c == ':') /* no '-' here: it would break comment detection */
                  )
                {
                    Tag.Name += c;
                    break;
                }
                
                // The name is complete.  If it is the expected end tag, emit
                // the raw content and continue parsing the tag normally.
                if(Tag.Is(RawCurrentTag))
                {
                    if(rawcontent.size())
                    {
                        push_back(new ElemRaw(rawcontent));
                        CLEARSTR(rawcontent);
                    }
                    
                    CLEARSTR(ParamName);
                    state = stTagSpace;
                    goto Statechange;
                }
                // Some other tag: put what was read back into the raw
                // content and keep collecting.
                rawcontent += '<';
                rawcontent += Tag.Name;
                state = stRawContent;
                goto Statechange;
#endif
        }
    }
    // End of input: flush whatever text is still pending.
    if(body.size())
    {
        push_back(new ElemBody(htmldecode(body)));
        CLEARSTR(body);
    }
    if(rawcontent.size())
    {
        push_back(new ElemRaw(rawcontent));
        CLEARSTR(rawcontent);
    }
    /* Don't add a broken tag to the end. */
}

#ifdef GXBLT
/* Optional support for altering the text content (compiled only when GXBLT is defined) */
/* Text filter handed to Page::FilterText() when GXBLT is defined.
 *
 * A string made up solely of spaces, newlines, carriage returns and tabs
 * (which includes the empty string) is returned unchanged.  Any other
 * string is replaced by the fixed text "gxblt".
 */
static wstring testifiltteri(const wstring &var)
{
    static const wchar_t blanks[] = L" \n\r\t";

    // Nothing but blanks: leave the string as it is.
    if(var.find_first_not_of(blanks) == wstring::npos)
        return var;

    return wstring(L"gxblt");
}
#endif

#include <argh.hh>

int main(int argc, const char *const *argv)
{
    string outset = "iso-8859-1";
    string inset = "iso-8859-1";
    
    ParamHandler Argh;
    Argh.AddLong("inset",   'I').SetString().SetDesc("Assumed input character set (default: "+inset+")", "setname");
    Argh.AddLong("outset",  'O').SetString().SetDesc("Wanted output character set (default: "+outset+")", "setname");
    Argh.AddLong("help",    'h').SetBool().SetDesc("This help.");
    Argh.AddLong("lossy",   'l').SetBool().SetDesc("Disable lossless conversion.");
    Argh.AddLong("usehex",  'e').SetBool().SetDesc("Use hexadecimal escapes.");
    Argh.AddLong("version", 'V').SetBool().SetDesc("Displays version information.");
    Argh.AddLong("strict",  's').SetBool().SetDesc("Turn off support for slightly broken HTML.");
    Argh.AddLong("verbose", 'v').SetBool().SetDesc("Be less quiet.");
    Argh.AddLong("quiet",   'q').SetBool().SetDesc("Be less verbose.");
    Argh.AddLong("xmlmode", 'x').SetBool().SetDesc("XML mode: all tag param values quoted.");
    Argh.AddLong("signature",'g').SetBool().SetDesc("Prefix the file with an unicode signature.");

    Argh.StartParse(argc, argv);
    for(;;)
    {
        long c = Argh.GetParam();
        if(c == -1)break;
        switch(c)
        {
            case 'V': printf("%s\n", VERSION); return 0;
            case 'I': inset = Argh.GetString(); break;
            case 'O': outset = Argh.GetString(); break;
            case 'l': lossless = !Argh.GetBool(); break;
            case 'e': usehex = Argh.GetBool(); break;
            case 's': strict = Argh.GetBool(); break;
            case 'v': verbose += Argh.GetBool() ? 1 : -1; break;
            case 'q': verbose -= Argh.GetBool() ? 1 : -1; break;
            case 'x': xmlmode = Argh.GetBool(); break;
            case 'g': signature = Argh.GetBool(); break;
            case 'h':
                printf(
                    "htmlrecode " VERSION " - Copyright (C) 1992,2003 Bisqwit (http://iki.fi/bisqwit/)\n"
                    "\n"
                    "Usage: htmlrecode [<option> [<...>]]\n"
                    "\n"
                    "Reads stdin, writes stdout.\n"
                    "\nOptions:\n");
                Argh.ListOptions();
                printf("\n"
                    "Pipe in the html file and pipe the output to result file.\n");
                return 0;
            default:
                // TODO
                break;
        }
    }
    if(!Argh.ok())return -1;

    Page p;

    rewind(stdin);
    p.Parse(stdin, inset.c_str());
    fclose(stdin);
    
    /* This is a support for altering the text content */
#ifdef GXBLT
    p.FilterText(testifiltteri);
#endif
    
    p.SetOut(outset.c_str());
    p.Dump();
    
    if(FixedStyleScript && verbose >= 1)
    {
        fprintf(stderr,
            "Warning: Fixed %u SCRIPT/STYLE block%s that %sn't properly hidden with HTML comments. Be grateful.\n",
                FixedStyleScript,
                FixedStyleScript==1 ? "" : "s",
                FixedStyleScript==1 ? "was" : "were");
    }
    if(ParamViolations && verbose >= 1)
    {
        fprintf(stderr,
            "Warning: Fixed %u broken (incorrectly unquoted) tag parameter%s. Be grateful.\n",
                ParamViolations,
                ParamViolations==1 ? "" : "s"
               );
    }
    
    return 0;
}
