Use stylesheet transformations to output HTML, for example

2015-08-11 15:33:08 -04:00
parent cfc0ba7e9a
commit 10927bee25
19 changed files with 725 additions and 92 deletions
--- a/src/format.c
+++ b/src/format.c
@@ -0,0 +1,208 @@
+#include <string.h>
+#include "queue.h"
+#include "stack.h"
+#include "utf8.h"
+#include "format.h"
+
+/* Instantiate FormatElementQueue, a queue of FormatElement values, through the
+ * DEFINE_QUEUE macro (presumably supplied by queue.h -- confirm). */
+DEFINE_QUEUE(FormatElement, FormatElementQueue);
+
+/* UTF-8 byte sequence for U+2014 EM DASH.  Not referenced in the visible part
+ * of this file. */
+#define EM_DASH_UTF8 "\xe2\x80\x94"
+
+/* A tokenizer is simply a UTF-8 iterator positioned somewhere in the input. */
+typedef utf8iterator* Tokenizer;
+/* Kinds of token produced by nextToken(). */
+typedef enum {
+    TOK_TEXT,    /* run of Latin-range / general-punctuation characters */
+    TOK_GREEK,   /* run starting with a Greek character */
+    TOK_UNICODE, /* run starting with any other character */
+    TOK_STAR,    /* a single '*' (toggles emphasis in formatText) */
+    TOK_REF,     /* a ^{id} reference; the token text is the id */
+    /* Token kinds that the tokenizer does not currently produce:
+    TOK_DASH,
+    TOK_OPEN_DOUBLE_QUOTE,
+    TOK_CLOSE_DOUBLE_QUOTE,
+    TOK_OPEN_SINGLE_QUOTE,
+    TOK_CLOSE_SINGLE_QUOTE,
+    */
+    TOK_EOF      /* end of input reached */
+} TokenType;
+/* One token: its kind plus, where applicable, a copy of its text. */
+typedef struct {
+    TokenType toktype;
+    /* strndup()-allocated text for TEXT/GREEK/UNICODE/REF tokens;
+     * NULL for STAR and EOF tokens. */
+    char* toktext;
+} Token;
+
+/* Create a tokenizer over txt by wrapping it in a new UTF-8 iterator.
+ * Release it with freeTokenizer(). */
+static Tokenizer
+initializeTokenizer(const char* txt) {
+    Tokenizer tokenizer = utf8NewIterator(txt);
+    return tokenizer;
+}
+
+/* Dispose of a tokenizer by handing its underlying UTF-8 iterator to
+ * utf8FreeIterator(). */
+static void freeTokenizer(utf8iterator* iter) {
+    utf8FreeIterator(iter);
+}
+
+/*
+ * Character-class helpers used by the tokenizer.
+ *
+ * These are declared `static inline`: a plain `inline` definition at file
+ * scope provides no external definition in C99/C11, so any call the compiler
+ * chooses not to inline would fail to link.
+ */
+
+/* Non-zero when ch lies in U+0370..U+03FF (Greek and Coptic) or
+ * U+1F00..U+1FFF (Greek Extended). */
+static inline int
+greekChar(uint32_t ch) {
+    return (((0x370 <= ch) && (ch <= 0x3ff)) ||
+            ((0x1f00 <= ch) && (ch <= 0x1fff)));
+}
+
+/* Non-zero when ch lies in U+2000..U+206F (General Punctuation: dashes,
+ * curly quotes, and the like). */
+static inline int
+extendedPunctuation(uint32_t ch) {
+    return ((0x2000 <= ch) && (ch <= 0x206f));
+}
+
+/* Non-zero when ch is at most U+00FF or is general punctuation, i.e. a
+ * character that may appear inside an ordinary text run. */
+static inline int
+latinChar(uint32_t ch) {
+    return (ch <= 0xff) || extendedPunctuation(ch);
+}
+
+/* Build a token of the given type whose text is a heap copy of the input
+ * bytes from byte offset `from` up to the tokenizer's current position. */
+static Token
+tokenFromSpan(Tokenizer tokenizer, TokenType type, int from) {
+    Token tok;
+    tok.toktype = type;
+    tok.toktext = strndup(tokenizer->txt + from, tokenizer->byteIndex - from);
+    return tok;
+}
+
+/*
+ * Consume and return the next token from the tokenizer.
+ *
+ *   - end of input        -> TOK_EOF  (no text)
+ *   - '*'                 -> TOK_STAR (no text)
+ *   - Greek character     -> TOK_GREEK: the run of Greek characters, spaces,
+ *                            commas and periods that follows
+ *   - "^{"                -> TOK_REF: the text up to the closing '}' (or end
+ *                            of input); the braces are not part of the text
+ *   - Latin-range char    -> TOK_TEXT: runs until a non-Latin character, a
+ *                            '*', or the start of a "^{" reference
+ *   - anything else       -> TOK_UNICODE: runs through non-Latin characters,
+ *                            spaces, commas, periods and general punctuation
+ *
+ * Token text, when present, is allocated with strndup().
+ */
+static Token
+nextToken(Tokenizer tokenizer) {
+    Token tok = { .toktype = TOK_EOF, .toktext = NULL };
+    const int begin = tokenizer->byteIndex;
+    uint32_t c = utf8CharAt(tokenizer);
+
+    if (c == '\0') {
+        return tok;
+    }
+
+    if (c == '*') {
+        utf8Advance(tokenizer);
+        tok.toktype = TOK_STAR;
+        return tok;
+    }
+
+    if (greekChar(c)) {
+        /* c is known to be Greek here, so at least one character is taken. */
+        do {
+            utf8Advance(tokenizer);
+            c = utf8CharAt(tokenizer);
+        } while ((c != 0) &&
+                 (greekChar(c) || (c == ' ') || (c == ',') || (c == '.')));
+        return tokenFromSpan(tokenizer, TOK_GREEK, begin);
+    }
+
+    if ((c == '^') && (tokenizer->txt[tokenizer->byteIndex + 1] == '{')) {
+        int idBegin = 0, idEnd = 0;
+        utf8Advance(tokenizer); /* step over ^ */
+        utf8Advance(tokenizer); /* step over { */
+        c = utf8CharAt(tokenizer);
+        idBegin = tokenizer->byteIndex;
+        while ((c != 0) && (c != '}')) {
+            utf8Advance(tokenizer);
+            c = utf8CharAt(tokenizer);
+        }
+        idEnd = tokenizer->byteIndex;
+        /* Stopped at either end-of-string or }; only the latter is consumed. */
+        if (c == '}') {
+            utf8Advance(tokenizer);
+        }
+        tok.toktype = TOK_REF;
+        tok.toktext = strndup(tokenizer->txt + idBegin, idEnd - idBegin);
+        return tok;
+    }
+
+    if (latinChar(c)) {
+        while ((c != 0) && latinChar(c) && (c != '*')) {
+            utf8Advance(tokenizer);
+            c = utf8CharAt(tokenizer);
+            /* A "^{" opens a reference, which must become its own token. */
+            if ((c == '^') && (tokenizer->txt[tokenizer->byteIndex + 1] == '{')) {
+                break;
+            }
+        }
+        return tokenFromSpan(tokenizer, TOK_TEXT, begin);
+    }
+
+    while ((c != 0) && (!latinChar(c) || (c == ' ') || (c == ',') || (c == '.') || extendedPunctuation(c))) {
+        utf8Advance(tokenizer);
+        c = utf8CharAt(tokenizer);
+    }
+    return tokenFromSpan(tokenizer, TOK_UNICODE, begin);
+}
+
+/* Instantiate FEQueueStack, a stack of FormatElementQueue values, through the
+ * DEFINE_STACK macro (presumably supplied by stack.h -- confirm).  formatText()
+ * pushes the enclosing queue onto it while an emphasis span is collected. */
+DEFINE_STACK(FormatElementQueue, FEQueueStack);
+
+/*
+ * Parse txt into an array of FormatElement values.
+ *
+ * Text, Greek, Unicode and ^{id} reference tokens each become one element
+ * carrying the token text.  A '*' toggles emphasis: the elements between a
+ * pair of stars are gathered into a nested array held by a single FORMAT_EM
+ * element.  An unmatched opening star is closed implicitly at end of input.
+ *
+ * txt           NUL-terminated UTF-8 input.
+ * dst           receives the newly built top-level element array.
+ * citationQPtr  may be NULL; otherwise every reference id not already found
+ *               by lookupCitation() is passed to addCitation().
+ *               NOTE(review): the id string handed to addCitation() is the
+ *               same pointer stored in the element -- confirm who owns it.
+ *
+ * Returns the number of top-level elements stored in *dst.  The array can be
+ * released with freeFormatElementArray().
+ */
+int formatText(const char* txt, FormatElement** dst, CitationRecordQueue* citationQPtr) {
+    Tokenizer tokenizer = initializeTokenizer(txt);
+    Token tok;
+    int listLength = 0, em = 0;
+    NEW_QUEUE(FormatElementQueue, formatElementQ);
+    /* NOTE(review): st is never released in this function; if stack.h offers
+     * a destroy macro it probably belongs before the return -- confirm. */
+    NEW_STACK(FEQueueStack, st);
+    while ((tok = nextToken(tokenizer)).toktype != TOK_EOF) {
+        if (tok.toktype == TOK_STAR) {
+            if (em) { /* end emphasis */
+                FormatElement* content = NULL;
+                FormatElement elt = { .elementType = FORMAT_EM };
+                QUEUE_TO_ARRAY(FormatElementQueue, formatElementQ, FormatElement, content);
+                elt.elementContentLength = QUEUE_LENGTH(formatElementQ);
+                elt.elementContent.nestedContent = content;
+                /* Release the span's queue before switching back to the
+                 * enclosing one, using the same QUEUE_TO_ARRAY-then-
+                 * DESTROY_QUEUE sequence applied to the top-level queue at
+                 * the end of this function; otherwise it is leaked. */
+                DESTROY_QUEUE(FormatElementQueue, formatElementQ);
+                formatElementQ = STACK_HEAD(st);
+                APPEND_QUEUE(FormatElementQueue, formatElementQ, elt);
+                POP_STACK(FEQueueStack, st);
+                em = 0;
+            } else { /* begin emphasis */
+                /* Park the enclosing queue and start collecting into a
+                 * fresh one. */
+                PUSH_STACK(FEQueueStack, st, formatElementQ);
+                REINIT_QUEUE(formatElementQ);
+                em = 1;
+            }
+        } else {
+            /* Initialised so the element type is never indeterminate. */
+            FormatElementType t = FORMAT_TEXT;
+            FormatElement elt = { .elementContent = { .textContent = tok.toktext } } ;
+            switch (tok.toktype) {
+            case TOK_GREEK:
+                t = FORMAT_GREEK;
+                break;
+            case TOK_UNICODE:
+                t = FORMAT_UNICODE;
+                break;
+            case TOK_REF:
+                t = FORMAT_CITATION;
+                /* Record each cited id once. */
+                if (citationQPtr && !lookupCitation(*citationQPtr, tok.toktext)) {
+                    addCitation(citationQPtr, tok.toktext);
+                }
+                break;
+            case TOK_TEXT:
+            default:
+                t = FORMAT_TEXT;
+                break;
+            }
+            elt.elementType = t;
+            APPEND_QUEUE(FormatElementQueue, formatElementQ, elt);
+        }
+    }
+    if (em) {
+        /* unmatched star -- close the emphasis here */
+        FormatElement* content = NULL;
+        FormatElement elt = { .elementType = FORMAT_EM };
+        QUEUE_TO_ARRAY(FormatElementQueue, formatElementQ, FormatElement, content);
+        elt.elementContentLength = QUEUE_LENGTH(formatElementQ);
+        elt.elementContent.nestedContent = content;
+        /* As above: release the span's queue before restoring its parent. */
+        DESTROY_QUEUE(FormatElementQueue, formatElementQ);
+        formatElementQ = STACK_HEAD(st);
+        APPEND_QUEUE(FormatElementQueue, formatElementQ, elt);
+        POP_STACK(FEQueueStack, st);
+    }
+
+    QUEUE_TO_ARRAY(FormatElementQueue, formatElementQ, FormatElement, *dst);
+    listLength = QUEUE_LENGTH(formatElementQ);
+    DESTROY_QUEUE(FormatElementQueue, formatElementQ);
+    freeTokenizer(tokenizer);
+    return listLength;
+}
+
+/*
+ * Release an element array of the given length, such as one produced by
+ * formatText().  FORMAT_EM elements have their nested arrays released
+ * recursively; every other element has its text freed.  The array itself is
+ * freed last.
+ */
+void freeFormatElementArray(FormatElement* a, int length) {
+    int idx;
+    for (idx = 0; idx < length; idx++) {
+        FormatElement* elt = &a[idx];
+        if (elt->elementType != FORMAT_EM) {
+            free(elt->elementContent.textContent);
+            continue;
+        }
+        freeFormatElementArray(elt->elementContent.nestedContent,
+                               elt->elementContentLength);
+    }
+    free(a);
+}
+
+#ifdef FORMATTER_TEST
+#include <stdio.h>
+
+/* Sample input mixing Latin text, CJK characters, a starred emphasis span,
+ * Greek text and a trailing ^{cite} reference. */
+const char* str = "My name in Chinese is \xe7\x86\x8a\xe5\xa4\xa7\xe8\xa1\x9b, or *xiao da wei*.  My favorite Greek passage is \xe1\xbc\x90\xce\xbd \xe1\xbc\x80\xcf\x81\xcf\x87\xe1\xbf\x87 \xe1\xbc\xa6\xce\xbd \xe1\xbd\x81 \xce\xbb\xe1\xbd\xb9\xce\xb3\xce\xbf\xcf\x82.^{cite}";
+
+/* Smoke test: format the sample string and release the result. */
+int
+main(void) {
+    FormatElement* lst = NULL;
+    /* formatText() takes three arguments; passing NULL as the citation queue
+     * makes it skip citation bookkeeping. */
+    int l = formatText(str, &lst, NULL);
+    freeFormatElementArray(lst, l);
+    return 0;
+}
+#endif