/***************************************************************************
 *   Copyright (C) 2004-2012 by Pere Constans
 *   constans@molspaces.com
 *   cb2Bib version 1.4.8. Licensed under the GNU GPL version 3.
 *   See the LICENSE file that comes with this distribution.
 ***************************************************************************/
#include "document.h"

#include "cb2bib_utilities.h"
#include "settings.h"

#include <QProcess>


/**
    Prepares a text conversion for file fn (leading and trailing whitespace
    is stripped from the name) and selects the external converter that
    corresponds to the requested conversion mode.
*/
document::document(const QString& fn, const Conversion mode)
{
    settings* const s(settings::instance());
    // File the external converter writes its text output to. The name is
    // built from the settings' temporary path and application pid.
    _converter_output = s->tempPath() + "/cb2bib_document_conversion_tmp_" + s->applicationPid();
    _document_fn = fn.trimmed();
    setConverter(mode);
}


/**
    Replaces the current document file name by fn (trimmed) and returns the
    result of toString() for it.
*/
QString document::fileToString(const QString& fn)
{
    const QString trimmed_fn(fn.trimmed());
    _document_fn = trimmed_fn;
    return toString();
}

/**
    Converts the file _document_fn to text and returns the cleaned-up text.

    The external converter _converter_bin is run with the arguments in
    _converter_arg (split at spaces), followed by the document file name and
    the temporary output file name _converter_output. The text written to
    the output file is then normalized: dashes, apostrophes and commas are
    unified, ligatures are expanded, detached accents are merged with their
    base letter, and control codes are blanked.

    Returns an empty string if no file name is set, if the converter or the
    document does not exist, if the converter cannot be started, if it times
    out or does not exit normally, or if the conversion produced no text.
    Except for the empty file name case, _error_string describes the
    failure. The converter's standard output and standard error are stored
    in _log_string.
*/
QString document::toString()
{
    _error_string.clear();
    _log_string.clear();
    if (_document_fn.isEmpty())
        return QString();
    // Only an absolute converter path can be checked here; any other name is
    // passed to QProcess unchecked
    QFileInfo fbin(_converter_bin);
    if (fbin.isAbsolute())
        if (!fbin.exists() || !fbin.isFile())
        {
            _error_string = QObject::tr("Converter %1 does not exist\n").arg(_converter_bin);
            return QString();
        }
    QFileInfo fi(_document_fn);
    if (!fi.exists() || !fi.isFile())
    {
        _error_string = QObject::tr("File %1 does not exist\n").arg(_document_fn);
        return QString();
    }

    QProcess converter;
    QStringList args(_converter_arg.split(' ', QString::SkipEmptyParts));
    args.append(_document_fn);
    args.append(_converter_output);
    converter.start(_converter_bin, args);
    if (!converter.waitForStarted())
    {
        _error_string += QObject::tr("Converter '%1 %2' could not be started. Check file permissions and path\n")
                         .arg(_converter_bin).arg(_converter_arg);
        return QString();
    }
    // Raw conversions wait without limit, other modes give up after 150 seconds
    if (!converter.waitForFinished(_conversion_mode == Raw ? -1 : 150000))
    {
        converter.kill();
        // Wait until the killed process has actually gone. Otherwise the exit
        // status checked below is not yet updated, and the output file would
        // be read while the converter might still be writing to it
        converter.waitForFinished();
        c2bUtils::warn(QObject::tr("Error: Document conversion timeout for file '%1'").arg(_document_fn));
    }
    // NOTE(review): the 'true' flag presumably asks the helper to remove the
    // temporary file once read - confirm in c2bUtils::fileToString
    QString doc(c2bUtils::fileToString(_converter_output, true));
    _log_string = QString::fromUtf8(converter.readAllStandardOutput().trimmed());
    _log_string += '\n' + QString::fromUtf8(converter.readAllStandardError().trimmed());
    if (converter.exitStatus() != QProcess::NormalExit || doc.isEmpty())
    {
        _error_string += QObject::tr("[%1 %2] Conversion failed for file %3\n")
                         .arg(_converter_bin).arg(_converter_arg).arg(_document_fn);
        c2bUtils::warn(QObject::tr("Warning: Text conversion for file '%1' is empty").arg(_document_fn));
        return QString();
    }

    // Do some processing
    // The order of the replacements below is significant; later steps rely
    // on the result of earlier ones

    // Remove null characters (avoids truncation in toUtf8, djvutxt places them at the end of page)
    doc.replace(QChar(0), ' ');
    // Unify separator
    doc.replace(QChar(8208), '-');
    doc.replace(QChar(8209), '-');
    doc.replace(QChar(8210), '-');
    doc.replace(QChar(8211), '-');
    doc.replace(QChar(8212), '-');
    doc.replace(QChar(8213), '-');
    doc.replace(QChar(8722), '-');
    doc.replace(QChar(65533), '-');
    if (_conversion_mode == Raw)
    {
        // Join hyphenated words
        doc.remove(QRegExp("-\\s*[\\n\\r]\\s*(?=\\w)"));
        // Remaining line breaks become spaces
        doc.replace(QRegExp("[\\n\\r]"), " ");
    }
    // Unify apostrophe
    doc.replace(QChar(8216), '\'');
    doc.replace(QChar(8217), '\'');
    // Unify comma
    doc.replace(QChar(65292), ',');
    // Revert Latin ligatures
    // Only 16 bit code points are listed: QChar(int) keeps the low 16 bits of
    // its argument, so larger values would just repeat one of the entries below
    doc.replace(QChar(306), "IJ");
    doc.replace(QChar(307), "ij");
    doc.replace(QChar(64256), "ff");
    doc.replace(QChar(64257), "fi");
    doc.replace(QChar(64258), "fl");
    doc.replace(QChar(64259), "ffi");
    doc.replace(QChar(64260), "ffl");
    if (doc.contains(QChar(174))) // Correct frequent misencoding of "fi" as circle R in PDFs
        doc.replace(QRegExp(QString("%1([a-z])").arg(QChar(174))), "fi\\1");
    // Corrections for pdftotext raw
    // Detached accents are merged with their base letter, whether the accent
    // comes before or after it. For capitals, the form followed by a space is
    // replaced before the plain form, so that the trailing space is consumed
    // when present
    doc.replace(QChar(305), 'i');
    doc.replace(QChar(711), ' ');
    const QChar grave(96);
    doc.replace(' ' + grave, " '");
    doc.replace(QString(2, grave), "'");
    doc.replace(grave + 'a', QChar(224));
    doc.replace(grave + 'e', QChar(232));
    doc.replace(grave + 'i', QChar(236));
    doc.replace(grave + 'o', QChar(242));
    doc.replace(grave + 'u', QChar(249));
    doc.replace(grave + 'A', QChar(192));
    doc.replace(grave + 'E', QChar(200));
    doc.replace(grave + 'I', QChar(204));
    doc.replace(grave + 'O', QChar(210));
    doc.replace(grave + 'U', QChar(217));
    doc.replace('a' + grave, QChar(224));
    doc.replace('e' + grave, QChar(232));
    doc.replace('i' + grave, QChar(236));
    doc.replace('o' + grave, QChar(242));
    doc.replace('u' + grave, QChar(249));
    doc.replace('A' + grave + ' ', QChar(192));
    doc.replace('E' + grave + ' ', QChar(200));
    doc.replace('I' + grave + ' ', QChar(204));
    doc.replace('O' + grave + ' ', QChar(210));
    doc.replace('U' + grave + ' ', QChar(217));
    doc.replace('A' + grave, QChar(192));
    doc.replace('E' + grave, QChar(200));
    doc.replace('I' + grave, QChar(204));
    doc.replace('O' + grave, QChar(210));
    doc.replace('U' + grave, QChar(217));
    const QChar dieresis(168);
    doc.replace(dieresis + 'a', QChar(228));
    doc.replace(dieresis + 'e', QChar(235));
    doc.replace(dieresis + 'i', QChar(239));
    doc.replace(dieresis + 'o', QChar(246));
    doc.replace(dieresis + 'u', QChar(252));
    doc.replace(dieresis + 'A', QChar(196));
    doc.replace(dieresis + 'E', QChar(203));
    doc.replace(dieresis + 'I', QChar(207));
    doc.replace(dieresis + 'O', QChar(214));
    doc.replace(dieresis + 'U', QChar(220));
    doc.replace('a' + dieresis, QChar(228));
    doc.replace('e' + dieresis, QChar(235));
    doc.replace('i' + dieresis, QChar(239));
    doc.replace('o' + dieresis, QChar(246));
    doc.replace('u' + dieresis, QChar(252));
    doc.replace('A' + dieresis + ' ', QChar(196));
    doc.replace('E' + dieresis + ' ', QChar(203));
    doc.replace('I' + dieresis + ' ', QChar(207));
    doc.replace('O' + dieresis + ' ', QChar(214));
    doc.replace('U' + dieresis + ' ', QChar(220));
    doc.replace('A' + dieresis, QChar(196));
    doc.replace('E' + dieresis, QChar(203));
    doc.replace('I' + dieresis, QChar(207));
    doc.replace('O' + dieresis, QChar(214));
    doc.replace('U' + dieresis, QChar(220));
    const QChar acute(180);
    doc.replace(acute + 'a', QChar(225));
    doc.replace(acute + 'e', QChar(233));
    doc.replace(acute + 'i', QChar(237));
    doc.replace(acute + 'o', QChar(243));
    doc.replace(acute + 'u', QChar(250));
    doc.replace(acute + 'A', QChar(193));
    doc.replace(acute + 'E', QChar(201));
    doc.replace(acute + 'I', QChar(205));
    doc.replace(acute + 'O', QChar(211));
    doc.replace(acute + 'U', QChar(218));
    doc.replace('a' + acute, QChar(225));
    doc.replace('e' + acute, QChar(233));
    doc.replace('i' + acute, QChar(237));
    doc.replace('o' + acute, QChar(243));
    doc.replace('u' + acute, QChar(250));
    doc.replace('A' + acute + ' ', QChar(193));
    doc.replace('E' + acute + ' ', QChar(201));
    doc.replace('I' + acute + ' ', QChar(205));
    doc.replace('O' + acute + ' ', QChar(211));
    doc.replace('U' + acute + ' ', QChar(218));
    doc.replace('A' + acute, QChar(193));
    doc.replace('E' + acute, QChar(201));
    doc.replace('I' + acute, QChar(205));
    doc.replace('O' + acute, QChar(211));
    doc.replace('U' + acute, QChar(218));
    const QChar circumflex(710);
    doc.replace(circumflex + 'a', QChar(226));
    doc.replace(circumflex + 'e', QChar(234));
    doc.replace(circumflex + 'i', QChar(238));
    doc.replace(circumflex + 'o', QChar(244));
    doc.replace(circumflex + 'u', QChar(251));
    doc.replace(circumflex + 'A', QChar(194));
    doc.replace(circumflex + 'E', QChar(202));
    doc.replace(circumflex + 'I', QChar(206));
    doc.replace(circumflex + 'O', QChar(212));
    doc.replace(circumflex + 'U', QChar(219));
    doc.replace('a' + circumflex, QChar(226));
    doc.replace('e' + circumflex, QChar(234));
    doc.replace('i' + circumflex, QChar(238));
    doc.replace('o' + circumflex, QChar(244));
    doc.replace('u' + circumflex, QChar(251));
    doc.replace('A' + circumflex + ' ', QChar(194));
    doc.replace('E' + circumflex + ' ', QChar(202));
    doc.replace('I' + circumflex + ' ', QChar(206));
    doc.replace('O' + circumflex + ' ', QChar(212));
    doc.replace('U' + circumflex + ' ', QChar(219));
    doc.replace('A' + circumflex, QChar(194));
    doc.replace('E' + circumflex, QChar(202));
    doc.replace('I' + circumflex, QChar(206));
    doc.replace('O' + circumflex, QChar(212));
    doc.replace('U' + circumflex, QChar(219));
    const QChar cedilla(184);
    doc.replace('c' + cedilla, QChar(231));
    doc.replace('C' + cedilla, QChar(199));
    doc.replace(cedilla + 'c', QChar(231));
    doc.replace(cedilla + 'C', QChar(199));
    doc.replace('s' + cedilla, QChar(351));
    doc.replace('S' + cedilla, QChar(350));
    doc.replace(cedilla + 's', QChar(351));
    doc.replace(cedilla + 'S', QChar(350));
    const QChar tilde(732);
    doc.replace('n' + tilde, QChar(241));
    doc.replace('N' + tilde, QChar(209));
    doc.replace(tilde + 'n', QChar(241));
    doc.replace(tilde + 'N', QChar(209));
    doc.replace('a' + tilde, QChar(227));
    doc.replace('A' + tilde, QChar(195));
    doc.replace(tilde + 'a', QChar(227));
    doc.replace(tilde + 'A', QChar(195));
    const QChar ring(730);
    doc.replace('a' + ring, QChar(229));
    doc.replace('A' + ring, QChar(197));
    doc.replace(ring + 'a', QChar(229));
    doc.replace(ring + 'A', QChar(197));
    // poppler
    doc.remove(QChar(826));
    doc.remove(QChar(841));
    doc.remove(QChar(849));
    doc.remove(QChar(850));
    doc.remove(QChar(851));
    doc.remove(QChar(852));
    doc.remove(QChar(853));
    doc.remove(QChar(854));
    doc.remove(QChar(855));
    doc.remove(QChar(856));
    doc.remove(QChar(858));
    doc.remove(QChar(862));
    doc.remove(QChar(873));
    doc.remove(QChar(874));
    doc.remove(QChar(888));
    // Skip control codes
    // Many are improper conversions from PDF to text mathematical symbols
    doc.replace(QChar(127), ' ');
    for (int i = 0; i < doc.length(); ++i)
    {
        QCharRef c = doc[i];
        const int u(c.unicode());
        if (u > 31)
            continue;
        if (u > 8 && u < 14) // tabs, new lines
            continue;
        c = ' ';
    }
    return doc;
}

void document::setConverter(const Conversion mode)
{
    _conversion_mode = mode;
    settings* s(settings::instance());
    switch (_conversion_mode)
    {
    case Raw:
        setConverter(s->fileName("c2bBibSearcher/Pdf2TextBin"), s->value("c2bBibSearcher/Pdf2TextArg").toString());
        break;
    case FirstPage:
    default:
        setConverter(s->fileName("c2bPdfImport/Pdf2TextBin"), s->value("c2bPdfImport/Pdf2TextArg").toString());
        break;
    }
}

void document::setConverter(const QString& bin, const QString& args)
{
    _converter_bin = bin.trimmed();
    _converter_arg = args.simplified();
}
