From cfc0ba7e9a9d1b965e8dfc37307065a33236ecb3 Mon Sep 17 00:00:00 2001
From: David Baer
Date: Thu, 6 Aug 2015 14:31:38 -0400
Subject: [PATCH] Numerous updates:

 * Add makefile to install data
 * Reference text is now formatted
 * UTF-8 string processing
---
 .gitignore          |   1 -
 Makefile.am         |   2 +-
 configure.ac        |   1 +
 data/Makefile.am    |   2 +
 src/Makefile.am     |   4 +-
 src/main.c          |  11 ++++
 src/sermon.h        |   2 +-
 src/sermon_parser.y |   2 +-
 src/sermon_util.c   |   2 +-
 src/utf8.c          | 124 ++++++++++++++++++++++++++++++++++++++++++++
 src/utf8.h          |  21 ++++++++
 src/xml.c           |  99 +++++++++++++++++++++++++++++++++++
 src/xml.h           |   7 +++
 13 files changed, 271 insertions(+), 7 deletions(-)
 create mode 100644 data/Makefile.am
 create mode 100644 src/utf8.c
 create mode 100644 src/utf8.h
 create mode 100644 src/xml.c
 create mode 100644 src/xml.h

diff --git a/.gitignore b/.gitignore
index 399b094..c1afdf1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,4 @@
 Makefile
-Makefile.bak
 Makefile.in
 aclocal.m4
 autom4te.cache/
diff --git a/Makefile.am b/Makefile.am
index 38bdf12..3426677 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,2 +1,2 @@
-SUBDIRS = src
+SUBDIRS = src data
 dist_doc_DATA = README
diff --git a/configure.ac b/configure.ac
index c3eec8b..c5f2353 100644
--- a/configure.ac
+++ b/configure.ac
@@ -9,5 +9,6 @@ AC_CONFIG_HEADERS([config.h])
 AC_CONFIG_FILES([
     Makefile
     src/Makefile
+    data/Makefile
 ])
 AC_OUTPUT
diff --git a/data/Makefile.am b/data/Makefile.am
new file mode 100644
index 0000000..6c8e803
--- /dev/null
+++ b/data/Makefile.am
@@ -0,0 +1,2 @@
+pkgdata_DATA = *.xsl sermon.dtd
+EXTRA_DIST = *.xsl sermon.dtd
diff --git a/src/Makefile.am b/src/Makefile.am
index 34bb5c8..2de3cb6 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,8 +1,8 @@
-AM_CPPFLAGS = ${libxml2_CFLAGS} ${libxslt_CFLAGS}
+AM_CPPFLAGS = ${libxml2_CFLAGS} ${libxslt_CFLAGS} -DDATADIR=\"$(pkgdatadir)\"
 bin_PROGRAMS = sermon
 BUILT_SOURCES = sermon_lexer.c sermon_parser.c sermon_parser.h
 AM_YFLAGS = -d --location
-sermon_SOURCES = sermon_lexer.l sermon_parser.y sermon_util.c main.c
+sermon_SOURCES = sermon_lexer.l sermon_parser.y sermon_util.c main.c xml.c
 sermon_LDADD = ${libxml2_LIBS} ${libxslt_LIBS}
 CLEANFILES = sermon_lexer.c sermon_parser.c sermon_parser.h
 LIBS = $(LEXLIB)
diff --git a/src/main.c b/src/main.c
index 7a49045..b460d42 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1,7 +1,9 @@
 #include
 #include
 #include
+#include
 #include "sermon.h"
+#include "xml.h"
 
 extern int yyparse(Sermon *);
 extern FILE* yyin;
@@ -16,6 +18,7 @@ void usage(const char* progname) {
 
 int main(int argc, char* argv[]) {
   Sermon sermon;
+  xmlDocPtr document;
   int i = 0, block = 0, normal = 0;
   const char* progname = argv[0], *filename = NULL;
   while (++i < argc) {
@@ -38,6 +41,7 @@ int main(int argc, char* argv[]) {
     yyin = fopen(argv[1], "rt");
   }
   yyparse(&sermon);
+  /*
   printf("Parsed sermon.\n");
   printf("TITLE=%s\n", sermon.sermonTitle ? sermon.sermonTitle : "none");
   printf("AUTHOR=%s\n", sermon.sermonAuthor ? sermon.sermonAuthor : "none");
@@ -55,6 +59,13 @@ int main(int argc, char* argv[]) {
     printf(" - %s: %s\n", sermon.sermonReferences[i].refId, sermon.sermonReferences[i].refText);
   }
   printf("\n");
+  */
+
+  document = sermonToXmlDoc(&sermon);
+  printXML(document);
+  xmlFreeDoc(document);
+
+  /* clean up, clean up, everybody, everywhere! */
   FreeSermon(&sermon);
   if (strcmp(filename, "-") != 0) {
     fclose(yyin);
diff --git a/src/sermon.h b/src/sermon.h
index 4487011..400e32c 100644
--- a/src/sermon.h
+++ b/src/sermon.h
@@ -18,7 +18,7 @@ typedef struct {
 
 typedef struct {
   char* refId;
-  char* refText;
+  SermonParagraph refText;
 } SermonReference;
 
 typedef struct {
diff --git a/src/sermon_parser.y b/src/sermon_parser.y
index 3585c7e..0942f42 100644
--- a/src/sermon_parser.y
+++ b/src/sermon_parser.y
@@ -126,7 +126,7 @@ references:
   | /* empty */
   ;
 reference:
-  '{' KW_REF ':' ID ':' REFTEXT '}' { SermonReference r = { .refId = $4, .refText = $6 }; APPEND_QUEUE(ReferenceQueue, referenceQ, r); }
+  '{' KW_REF ':' ID ':' REFTEXT '}' { SermonReference r = { .refId = $4, .refText = { .paraType = PARA_DEFAULT, .paraText = $6 } }; APPEND_QUEUE(ReferenceQueue, referenceQ, r); }
   ;
 
 %%
diff --git a/src/sermon_util.c b/src/sermon_util.c
index e1ddb2e..b981e36 100644
--- a/src/sermon_util.c
+++ b/src/sermon_util.c
@@ -23,7 +23,7 @@ void FreeSermon(Sermon* srm) {
   if (srm->numReferences) {
     for (i = 0; i < srm->numReferences; i++) {
       free(srm->sermonReferences[i].refId);
-      free(srm->sermonReferences[i].refText);
+      free(srm->sermonReferences[i].refText.paraText);
     }
     free(srm->sermonReferences);
   }
diff --git a/src/utf8.c b/src/utf8.c
new file mode 100644
index 0000000..1d5b074
--- /dev/null
+++ b/src/utf8.c
@@ -0,0 +1,124 @@
+#include <stdlib.h>
+#include <string.h>
+#include "utf8.h"
+
+/* True when c is a UTF-8 continuation byte (10xxxxxx). */
+static int
+isContinuation(char c) {
+  return (c & 0xc0) == 0x80;
+}
+
+/*
+ * Length in bytes (1-6) of the sequence at the iterator's position, or -1
+ * if the lead byte is invalid or a continuation byte is missing.  The NUL
+ * terminator counts as a one-byte sequence.
+ */
+static int
+sequenceLength(const utf8iterator* iter) {
+  const char* p = iter->txt + iter->byteIndex;
+  int len, i;
+
+  if ((p[0] & 0x80) == 0) {
+    return 1;
+  } else if ((p[0] & 0xe0) == 0xc0) {
+    len = 2;
+  } else if ((p[0] & 0xf0) == 0xe0) {
+    len = 3;
+  } else if ((p[0] & 0xf8) == 0xf0) {
+    len = 4;
+  } else if ((p[0] & 0xfc) == 0xf8) {
+    len = 5;
+  } else if ((p[0] & 0xfe) == 0xfc) {
+    len = 6;
+  } else {
+    return -1;
+  }
+  /* Stops at the first bad byte, so the scan never passes the NUL. */
+  for (i = 1; i < len; i++) {
+    if (!isContinuation(p[i])) {
+      return -1;
+    }
+  }
+  return len;
+}
+
+/* Allocates a zeroed iterator over txt (borrowed); NULL if out of memory. */
+utf8iterator*
+utf8NewIterator(const char* txt) {
+  utf8iterator* result = calloc(1, sizeof *result);
+  if (result) {
+    result->txt = txt;
+  }
+  return result;
+}
+
+/* Decodes the current character; 0 at the NUL or on a malformed sequence. */
+uint32_t
+utf8CharAt(const utf8iterator* iter) {
+  /* Payload bits of the lead byte, indexed by sequence length. */
+  static const unsigned char leadMask[] = {
+    0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01
+  };
+  const char* p = iter->txt + iter->byteIndex;
+  int len = sequenceLength(iter);
+  uint32_t cp;
+  int i;
+
+  if (len < 0) {
+    return 0;
+  }
+  cp = (uint32_t)(p[0] & leadMask[len]);
+  for (i = 1; i < len; i++) {
+    cp = (cp << 6) | (uint32_t)(p[i] & 0x3f);
+  }
+  return cp;
+}
+
+/*
+ * Steps to the next character and returns the new logical index, or -1
+ * (iterator unchanged) at the NUL terminator or on a malformed sequence.
+ */
+int
+utf8Advance(utf8iterator* iter) {
+  int len;
+
+  if (iter->txt[iter->byteIndex] == '\0') {
+    return -1; /* stepping over the terminator would leave the string */
+  }
+  len = sequenceLength(iter);
+  if (len < 0) {
+    return -1;
+  }
+  iter->byteIndex += len;
+  return ++iter->logicalIndex;
+}
+
+/*
+ * Copies the current character's bytes into dest, zero-filling the rest of
+ * the bufsize-byte buffer.  Returns 1 on success, or 0 (dest untouched) if
+ * the sequence is malformed or does not fit with its terminating NUL.
+ */
+int
+utf8CopyEncodedCharAt(const utf8iterator* iter, size_t bufsize, char* dest) {
+  int len = sequenceLength(iter);
+
+  /* Test len before converting: -1 as size_t would pass the size check. */
+  if (len < 0 || (size_t)len + 1 > bufsize) {
+    return 0;
+  }
+  memset(dest, 0, bufsize);
+  memcpy(dest, iter->txt + iter->byteIndex, (size_t)len);
+  return 1;
+}
+
+/* Rewinds the iterator to the first character. */
+void
+utf8ResetIterator(utf8iterator* iter) {
+  iter->logicalIndex = iter->byteIndex = 0;
+}
+
+/* Releases an iterator obtained from utf8NewIterator. */
+void
+utf8FreeIterator(utf8iterator* iter) {
+  free(iter);
+}
diff --git a/src/utf8.h b/src/utf8.h
new file mode 100644
index 0000000..8811b6c
--- /dev/null
+++ b/src/utf8.h
@@ -0,0 +1,21 @@
+#ifndef _UTF8_H
+#define _UTF8_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Cursor over a NUL-terminated UTF-8 string; txt is borrowed, not owned. */
+typedef struct {
+  const char* txt;
+  int byteIndex;    /* byte offset of the current character */
+  int logicalIndex; /* count of characters advanced past */
+} utf8iterator;
+
+utf8iterator* utf8NewIterator(const char* txt);
+int utf8Advance(utf8iterator*);
+uint32_t utf8CharAt(const utf8iterator*);
+void utf8ResetIterator(utf8iterator*);
+void utf8FreeIterator(utf8iterator*);
+int utf8CopyEncodedCharAt(const utf8iterator* iter, size_t bufsize, char* dest);
+
+#endif /* !def _UTF8_H */
diff --git a/src/xml.c b/src/xml.c
new file mode 100644
index 0000000..1591bb8
--- /dev/null
+++ b/src/xml.c
@@ -0,0 +1,99 @@
+#include
+#include "sermon.h"
+
+static void
+appendHeaderNode(xmlNodePtr headerNode, const char* headerName,
+                 const char* headerText) {
+  if (headerText) {
+    xmlNodePtr ptr = xmlNewNode(NULL, headerName);
+    xmlAddChild(ptr, xmlNewText(headerText));
+    xmlAddChild(headerNode, ptr);
+  }
+}
+
+xmlNodePtr
+sermonHeader(const Sermon* srm) {
+  xmlNodePtr header = xmlNewNode(NULL, "header");
+  appendHeaderNode(header, "title", srm->sermonTitle);
+  appendHeaderNode(header, "author", srm->sermonAuthor);
+  appendHeaderNode(header, "occasion", srm->sermonOccasion);
+  appendHeaderNode(header, "date", srm->sermonDate);
+  appendHeaderNode(header, "text", srm->sermonText);
+  return header;
+}
+
+
+static xmlNodePtr
+paragraphToXML(const SermonParagraph* p) {
+  xmlNodePtr result = xmlNewNode(NULL, "p");
+  xmlAddChild(result, xmlNewText(p->paraText));
+  return result;
+}
+
+xmlNodePtr
+sermonBody(const Sermon* srm) {
+  xmlNodePtr body = xmlNewNode(NULL, "body");
+  xmlNodePtr block = NULL;
+  int i = 0;
+  for (i = 0; i < srm->numParagraphs; i++) {
+    const SermonParagraph* p = &srm->sermonParagraphs[i];
+    xmlNodePtr para = paragraphToXML(p);
+    if (p->paraType == PARA_BLOCKQUOTE) {
+      if (!block) {
+        block = xmlNewNode(NULL, "quote");
+        xmlAddChild(body, block);
+      }
+      xmlAddChild(block, para);
+    } else {
+      block = NULL;
+      xmlAddChild(body, para);
+    }
+  }
+  return body;
+}
+
+xmlNodePtr
+sermonFooter(const Sermon* srm) {
+  xmlNodePtr footer = xmlNewNode(NULL, "footer");
+  int i = 0;
+  char num[10];
+  for (i = 0; i < srm->numReferences; i++) {
+    const SermonReference* r = &srm->sermonReferences[i];
+    xmlNodePtr ref = xmlNewNode(NULL, "ref");
+    snprintf(num, 10, "%d", i + 1);
+    xmlNewProp(ref, "number", num);
+    xmlAddChild(ref, paragraphToXML(&r->refText));
+    xmlAddChild(footer, ref);
+  }
+  return footer;
+}
+
+xmlDocPtr
+sermonToXmlDoc(const Sermon* srm) {
+  /* document creation and setup */
+  xmlDocPtr document = xmlNewDoc("1.0");
+  xmlDtdPtr dtd = xmlCreateIntSubset(document, "sermon", NULL, DATADIR "/sermon.dtd");
+  xmlNodePtr sermon = xmlNewNode(NULL, "sermon");
+  xmlNsPtr sermon_ns = xmlNewNs(sermon, "urn:david-sermon", NULL);
+  xmlDocSetRootElement(document, sermon);
+
+  /* add header */
+  xmlAddChild(sermon, sermonHeader(srm));
+
+  /* add body paragraphs */
+  xmlAddChild(sermon, sermonBody(srm));
+
+  if (srm->numReferences) {
+    /* add footer */
+    xmlAddChild(sermon, sermonFooter(srm));
+  }
+
+  return document;
+}
+
+void
+printXML(xmlDocPtr document) {
+  xmlCharEncodingHandlerPtr encoding = xmlFindCharEncodingHandler("utf-8");
+  xmlOutputBufferPtr output = xmlOutputBufferCreateFd(1, encoding);
+  xmlSaveFileTo(output, document, "utf-8");
+}
diff --git a/src/xml.h b/src/xml.h
new file mode 100644
index 0000000..6a74710
--- /dev/null
+++ b/src/xml.h
@@ -0,0 +1,7 @@
+#ifndef _XML_H
+#define _XML_H
+
+xmlDocPtr sermonToXmlDoc(const Sermon*);
+void printXML(xmlDocPtr);
+
+#endif /* !def _XML_H */