/* * format.c * Copyright © 2015 David A. Baer * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the organization nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY David A. Baer ''AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL David A. Baer BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include #include "queue.h" #include "stack.h" #include "utf8.h" #include "format.h" DEFINE_QUEUE(FormatElement, FormatElementQueue); #define EM_DASH_UTF8 "\xe2\x80\x94" typedef utf8iterator* Tokenizer; typedef enum { TOK_TEXT, TOK_GREEK, TOK_UNICODE, TOK_STAR, TOK_REF, /* TOK_DASH, TOK_OPEN_DOUBLE_QUOTE, TOK_CLOSE_DOUBLE_QUOTE, TOK_OPEN_SINGLE_QUOTE, TOK_CLOSE_SINGLE_QUOTE, */ TOK_EOF } TokenType; typedef struct { TokenType toktype; char* toktext; } Token; static Tokenizer initializeTokenizer(const char* txt) { return utf8NewIterator(txt); } static void freeTokenizer(utf8iterator* iter) { utf8FreeIterator(iter); } inline int greekChar(uint32_t ch) { return (((0x370 <= ch) && (ch <= 0x3ff)) || ((0x1f00 <= ch) && (ch <= 0x1fff))); } inline int extendedPunctuation(uint32_t ch) { return ((0x2000 <= ch) && (ch <= 0x206f)); } inline int latinChar(uint32_t ch) { return (ch <= 0xff) || extendedPunctuation(ch); } static Token nextToken(Tokenizer tokenizer) { int startIndex = tokenizer->byteIndex; uint32_t ch = utf8CharAt(tokenizer); Token result; memset(&result, 0, sizeof(result)); if (ch == '\0') { result.toktype = TOK_EOF; result.toktext = NULL; return result; } else if (ch == '*') { utf8Advance(tokenizer); result.toktype = TOK_STAR; result.toktext = NULL; return result; } else if (greekChar(ch)) { while ((ch != 0) && (greekChar(ch) || (ch == ' ') || (ch == ',') || (ch == '.'))) { utf8Advance(tokenizer); ch = utf8CharAt(tokenizer); } result.toktype = TOK_GREEK; result.toktext = strndup(tokenizer->txt + startIndex, tokenizer->byteIndex - startIndex); return result; } else if ((ch == '^') && (tokenizer->txt[tokenizer->byteIndex + 1] == '{')) { int idStart = 0, idEnd = 0; utf8Advance(tokenizer); /* to { */ utf8Advance(tokenizer); /* to id */ ch = utf8CharAt(tokenizer); idStart = tokenizer->byteIndex; while ((ch != 0) && (ch != '}')) { utf8Advance(tokenizer); ch = utf8CharAt(tokenizer); } idEnd = tokenizer->byteIndex; /* reached end-of-string or } */ if (ch == '}') utf8Advance(tokenizer); result.toktype = TOK_REF; result.toktext = strndup(tokenizer->txt + idStart, idEnd - idStart); return result; } else if (latinChar(ch)) { while ((ch != 0) && latinChar(ch) && (ch != '*')) { utf8Advance(tokenizer); ch = utf8CharAt(tokenizer); if (ch == '^') { if (tokenizer->txt[tokenizer->byteIndex + 1] == '{') break; } } result.toktype = TOK_TEXT; result.toktext = strndup(tokenizer->txt + startIndex, tokenizer->byteIndex - startIndex); return result; } else { while ((ch != 0) && (!latinChar(ch) || (ch == ' ') || (ch == ',') || (ch == '.') || extendedPunctuation(ch))) { utf8Advance(tokenizer); ch = utf8CharAt(tokenizer); } result.toktype = TOK_UNICODE; result.toktext = strndup(tokenizer->txt + startIndex, tokenizer->byteIndex - startIndex); return result; } } DEFINE_STACK(FormatElementQueue, FEQueueStack); int formatText(const char* txt, FormatElement** dst, CitationRecordQueue* citationQPtr) { Tokenizer tokenizer = initializeTokenizer(txt); Token tok; int listLength = 0, em = 0; NEW_QUEUE(FormatElementQueue, formatElementQ); NEW_STACK(FEQueueStack, st); while ((tok = nextToken(tokenizer)).toktype != TOK_EOF) { if (tok.toktype == TOK_STAR) { if (em) { /* end emphasis */ FormatElement* content = NULL; FormatElement elt = { .elementType = FORMAT_EM }; QUEUE_TO_ARRAY(FormatElementQueue, formatElementQ, FormatElement, content); elt.elementContentLength = QUEUE_LENGTH(formatElementQ); elt.elementContent.nestedContent = content; formatElementQ = STACK_HEAD(st); APPEND_QUEUE(FormatElementQueue, formatElementQ, elt); POP_STACK(FEQueueStack, st); em = 0; } else { /* begin emphasis */ PUSH_STACK(FEQueueStack, st, formatElementQ); REINIT_QUEUE(formatElementQ); em = 1; } } else { FormatElementType t; FormatElement elt = { .elementContent = { .textContent = tok.toktext } } ; if (tok.toktype == TOK_TEXT) { t = FORMAT_TEXT; } else if (tok.toktype == TOK_GREEK) { t = FORMAT_GREEK; } else if (tok.toktype == TOK_UNICODE) { t = FORMAT_UNICODE; } else if (tok.toktype == TOK_REF) { t = FORMAT_CITATION; if (citationQPtr && !lookupCitation(*citationQPtr, tok.toktext)) { addCitation(citationQPtr, tok.toktext); } } elt.elementType = t; APPEND_QUEUE(FormatElementQueue, formatElementQ, elt); } } if (em) { /* unmatched star -- close the emphasis here */ FormatElement* content = NULL; FormatElement elt = { .elementType = FORMAT_EM }; QUEUE_TO_ARRAY(FormatElementQueue, formatElementQ, FormatElement, content); elt.elementContentLength = QUEUE_LENGTH(formatElementQ); elt.elementContent.nestedContent = content; formatElementQ = STACK_HEAD(st); APPEND_QUEUE(FormatElementQueue, formatElementQ, elt); POP_STACK(FEQueueStack, st); } QUEUE_TO_ARRAY(FormatElementQueue, formatElementQ, FormatElement, *dst); listLength = QUEUE_LENGTH(formatElementQ); DESTROY_QUEUE(FormatElementQueue, formatElementQ); freeTokenizer(tokenizer); return listLength; } void freeFormatElementArray(FormatElement* a, int length) { int i = 0; for (i = 0; i < length; i++) { if (a[i].elementType == FORMAT_EM) { freeFormatElementArray(a[i].elementContent.nestedContent, a[i].elementContentLength); } else { free(a[i].elementContent.textContent); } } free(a); }