sermon/src/format.c
/*
 * format.c
 * Copyright © 2015 David A. Baer
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the organization nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY David A. Baer ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL David A. Baer BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
/* strndup() is POSIX.1-2008; request it explicitly in case the
 * compiler's default feature set does not expose it. */
#define _POSIX_C_SOURCE 200809L

#include <ctype.h>   /* tolower() */
#include <stdlib.h>  /* free() */
#include <string.h>  /* memset(), strndup() */
#include "queue.h"
#include "stack.h"
#include "utf8.h"
#include "format.h"

DEFINE_QUEUE(FormatElement, FormatElementQueue);
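
/* UTF-8 encoding of U+2014 EM DASH; unused here at present, presumably
 * reserved for the smart-punctuation tokens commented out in TokenType
 * below. */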
#define EM_DASH_UTF8 "\xe2\x80\x94"
typedef utf8iterator* Tokenizer;

typedef enum {
    TOK_TEXT,
    TOK_GREEK,
    TOK_UNICODE,
    TOK_STAR,
    TOK_REF,
    TOK_URL,
    TOK_BREAK,
    /*
    TOK_DASH,
    TOK_OPEN_DOUBLE_QUOTE,
    TOK_CLOSE_DOUBLE_QUOTE,
    TOK_OPEN_SINGLE_QUOTE,
    TOK_CLOSE_SINGLE_QUOTE,
    */
    TOK_EOF
} TokenType;
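
/* For token types that carry text, toktext is heap-allocated via
 * strndup() and ownership passes to the FormatElement that receives it
 * (released later by freeFormatElementArray()); for the others it is
 * NULL. */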
typedef struct {
    TokenType toktype;
    char* toktext;
} Token;

static Tokenizer
initializeTokenizer(const char* txt) {
    return utf8NewIterator(txt);
}

static void
freeTokenizer(utf8iterator* iter) {
    utf8FreeIterator(iter);
}
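
/* True for code points in the Greek and Coptic block (U+0370-U+03FF)
 * and the Greek Extended block (U+1F00-U+1FFF). */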
int
greekChar(uint32_t ch) {
    return (((0x370 <= ch) && (ch <= 0x3ff)) ||
            ((0x1f00 <= ch) && (ch <= 0x1fff)));
}
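
/* True for code points in the General Punctuation block
 * (U+2000-U+206F): curly quotes, dashes, ellipsis, and the like. */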
int
extendedPunctuation(uint32_t ch) {
    return ((0x2000 <= ch) && (ch <= 0x206f));
}
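
/* Latin-1 plus general punctuation, so smart quotes and dashes are
 * treated as part of ordinary text runs. */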
int
latinChar(uint32_t ch) {
    return (ch <= 0xff) || extendedPunctuation(ch);
}
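
/* True if the text at the current byte position begins with "http://"
 * or "https://" (scheme matched case-insensitively). Safe near
 * end-of-string: && short-circuits at the NUL terminator before any
 * later byte is read. */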
int
httpAt(Tokenizer tokenizer) {
    const char* s = tokenizer->txt + tokenizer->byteIndex;
    /* cast to unsigned char avoids undefined behavior when tolower()
       receives a byte >= 0x80 through a signed char */
    return ((tolower((unsigned char) s[0]) == 'h') &&
            (tolower((unsigned char) s[1]) == 't') &&
            (tolower((unsigned char) s[2]) == 't') &&
            (tolower((unsigned char) s[3]) == 'p') &&
            (((s[4] == ':') && (s[5] == '/') && (s[6] == '/')) ||
             ((tolower((unsigned char) s[4]) == 's') &&
              (s[5] == ':') && (s[6] == '/') && (s[7] == '/'))));
}
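
/* Scan the next token: '*' toggles emphasis, '\n' is a line break,
 * runs of Greek or other non-Latin text become single tokens, ^{id}
 * marks a citation reference, and http(s) URLs are captured up to the
 * next whitespace. Anything else is a plain Latin text run. */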
static Token
nextToken(Tokenizer tokenizer) {
    int startIndex = tokenizer->byteIndex;
    uint32_t ch = utf8CharAt(tokenizer);
    Token result;
    memset(&result, 0, sizeof(result));
    if (ch == '\0') {
        result.toktype = TOK_EOF;
        result.toktext = NULL;
        return result;
    } else if (ch == '*') {
        utf8Advance(tokenizer);
        result.toktype = TOK_STAR;
        result.toktext = NULL;
        return result;
    } else if (ch == '\n') {
        utf8Advance(tokenizer);
        result.toktype = TOK_BREAK;
        result.toktext = NULL;
        return result;
    } else if (greekChar(ch)) {
        /* consume a run of Greek characters, allowing embedded spaces
           and basic punctuation */
        while ((ch != 0) &&
               (greekChar(ch) || (ch == ' ') || (ch == ',') || (ch == '.'))) {
            utf8Advance(tokenizer);
            ch = utf8CharAt(tokenizer);
        }
        result.toktype = TOK_GREEK;
        result.toktext = strndup(tokenizer->txt + startIndex,
                                 tokenizer->byteIndex - startIndex);
        return result;
    } else if ((ch == '^') && (tokenizer->txt[tokenizer->byteIndex + 1] == '{')) {
        /* citation reference of the form ^{id} */
        int idStart = 0, idEnd = 0;
        utf8Advance(tokenizer); /* to { */
        utf8Advance(tokenizer); /* to id */
        ch = utf8CharAt(tokenizer);
        idStart = tokenizer->byteIndex;
        while ((ch != 0) && (ch != '}')) {
            utf8Advance(tokenizer);
            ch = utf8CharAt(tokenizer);
        }
        idEnd = tokenizer->byteIndex;
        /* reached end-of-string or } */
        if (ch == '}')
            utf8Advance(tokenizer);
        result.toktype = TOK_REF;
        result.toktext = strndup(tokenizer->txt + idStart, idEnd - idStart);
        return result;
    } else if (httpAt(tokenizer)) {
        int endIndex = 0;
        while ((ch != 0) && (ch != ' ') && (ch != '\r') && (ch != '\n')) {
            utf8Advance(tokenizer);
            ch = utf8CharAt(tokenizer);
        }
        if (tokenizer->txt[tokenizer->byteIndex - 1] == '.') {
            /* heuristic: a URL doesn't end in '.'; back up one byte so
               the period is scanned as ordinary text */
            endIndex = --tokenizer->byteIndex;
        } else {
            endIndex = tokenizer->byteIndex;
        }
        result.toktype = TOK_URL;
        result.toktext = strndup(tokenizer->txt + startIndex,
                                 endIndex - startIndex);
        return result;
    } else if (latinChar(ch)) {
        while ((ch != 0) && latinChar(ch) && (ch != '*') && (ch != '\n')) {
            utf8Advance(tokenizer);
            ch = utf8CharAt(tokenizer);
            if (ch == '^') {
                if (tokenizer->txt[tokenizer->byteIndex + 1] == '{') break;
            } else if (httpAt(tokenizer)) {
                break;
            }
        }
        result.toktype = TOK_TEXT;
        result.toktext = strndup(tokenizer->txt + startIndex,
                                 tokenizer->byteIndex - startIndex);
        return result;
    } else {
        while ((ch != 0) &&
               (!latinChar(ch) || (ch == ' ') || (ch == ',') || (ch == '.') ||
                extendedPunctuation(ch))) {
            utf8Advance(tokenizer);
            ch = utf8CharAt(tokenizer);
        }
        result.toktype = TOK_UNICODE;
        result.toktext = strndup(tokenizer->txt + startIndex,
                                 tokenizer->byteIndex - startIndex);
        return result;
    }
}

DEFINE_STACK(FormatElementQueue, FEQueueStack);
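
/* Parse txt into a flat array of FormatElements stored in *dst and
 * return its length. Emphasis spans become FORMAT_EM elements whose
 * nestedContent holds the enclosed elements; the queue stack suspends
 * the current output queue while an emphasis span is open. Citations
 * are recorded (once each) in *citationQPtr when it is non-NULL. */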
int formatText(const char* txt, FormatElement** dst, CitationRecordQueue* citationQPtr) {
    Tokenizer tokenizer = initializeTokenizer(txt);
    Token tok;
    int listLength = 0, em = 0;
    NEW_QUEUE(FormatElementQueue, formatElementQ);
    NEW_STACK(FEQueueStack, st);
    while ((tok = nextToken(tokenizer)).toktype != TOK_EOF) {
        if (tok.toktype == TOK_STAR) {
            if (em) { /* end emphasis */
                FormatElement* content = NULL;
                FormatElement elt = { .elementType = FORMAT_EM };
                QUEUE_TO_ARRAY(FormatElementQueue, formatElementQ, FormatElement, content);
                elt.elementContentLength = QUEUE_LENGTH(formatElementQ);
                elt.elementContent.nestedContent = content;
                formatElementQ = STACK_HEAD(st);
                APPEND_QUEUE(FormatElementQueue, formatElementQ, elt);
                POP_STACK(FEQueueStack, st);
                em = 0;
            } else { /* begin emphasis */
                PUSH_STACK(FEQueueStack, st, formatElementQ);
                REINIT_QUEUE(formatElementQ);
                em = 1;
            }
        } else if (tok.toktype == TOK_BREAK) {
            FormatElement elt = { .elementType = FORMAT_BR, .elementContentLength = 0,
                                  .elementContent = { .textContent = NULL } };
            APPEND_QUEUE(FormatElementQueue, formatElementQ, elt);
        } else {
            FormatElementType t;
            FormatElement elt = { .elementContent = { .textContent = tok.toktext } };
            if (tok.toktype == TOK_TEXT) {
                t = FORMAT_TEXT;
            } else if (tok.toktype == TOK_GREEK) {
                t = FORMAT_GREEK;
            } else if (tok.toktype == TOK_UNICODE) {
                t = FORMAT_UNICODE;
            } else if (tok.toktype == TOK_URL) {
                t = FORMAT_URL;
            } else { /* TOK_REF is the only type left, so t is always set */
                t = FORMAT_CITATION;
                if (citationQPtr && !lookupCitation(*citationQPtr, tok.toktext)) {
                    addCitation(citationQPtr, tok.toktext);
                }
            }
            elt.elementType = t;
            APPEND_QUEUE(FormatElementQueue, formatElementQ, elt);
        }
    }
    if (em) {
        /* unmatched star -- close the emphasis here */
        FormatElement* content = NULL;
        FormatElement elt = { .elementType = FORMAT_EM };
        QUEUE_TO_ARRAY(FormatElementQueue, formatElementQ, FormatElement, content);
        elt.elementContentLength = QUEUE_LENGTH(formatElementQ);
        elt.elementContent.nestedContent = content;
        formatElementQ = STACK_HEAD(st);
        APPEND_QUEUE(FormatElementQueue, formatElementQ, elt);
        POP_STACK(FEQueueStack, st);
    }
    QUEUE_TO_ARRAY(FormatElementQueue, formatElementQ, FormatElement, *dst);
    listLength = QUEUE_LENGTH(formatElementQ);
    DESTROY_QUEUE(FormatElementQueue, formatElementQ);
    freeTokenizer(tokenizer);
    return listLength;
}
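
/* Recursively release an array produced by formatText(): FORMAT_EM
 * elements own a nested array, all other elements own their (possibly
 * NULL) text; free(NULL) is a no-op. */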
void freeFormatElementArray(FormatElement* a, int length) {
    int i = 0;
    for (i = 0; i < length; i++) {
        if (a[i].elementType == FORMAT_EM) {
            freeFormatElementArray(a[i].elementContent.nestedContent,
                                   a[i].elementContentLength);
        } else {
            free(a[i].elementContent.textContent);
        }
    }
    free(a);
}
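
/*
 * Minimal usage sketch, not part of the library build: compile this
 * file with -DFORMAT_DEMO (a hypothetical flag, not defined anywhere
 * in the project) to exercise formatText() on a small input. It relies
 * only on the declarations above; passing NULL for the citation queue
 * is permitted because formatText() checks the pointer before use.
 */
#ifdef FORMAT_DEMO
#include <stdio.h>

int main(void) {
    FormatElement* elements = NULL;
    int i, n = formatText("See *this* example at http://example.org\n",
                          &elements, NULL);
    for (i = 0; i < n; i++) {
        switch (elements[i].elementType) {
        case FORMAT_TEXT:
            printf("text: \"%s\"\n", elements[i].elementContent.textContent);
            break;
        case FORMAT_EM:
            printf("emphasis: %d nested element(s)\n",
                   elements[i].elementContentLength);
            break;
        case FORMAT_URL:
            printf("url: %s\n", elements[i].elementContent.textContent);
            break;
        case FORMAT_BR:
            printf("line break\n");
            break;
        default: /* FORMAT_GREEK, FORMAT_UNICODE, FORMAT_CITATION */
            break;
        }
    }
    freeFormatElementArray(elements, n);
    return 0;
}
#endif /* FORMAT_DEMO */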