如何使用 Ruby 和 Parslet 解析 RTF 文本?
How do I parse rtf text using ruby and parslet?
我有来自富文本格式 (RTF) 文件的以下数据:
{\rtf1\ansi\deff3\adeflang1025\n{\fonttbl{\f0\froman\fprq2\fcharset0
Times New Roman;}{\f1\froman\fprq2\fcharset2
Symbol;}{\f2\fswiss\fprq2\fcharset0
Arial;}{\f3\froman\fprq2\fcharset128 Times New
Roman;}{\f4\fswiss\fprq2\fcharset128
Arial;}{\f5\fnil\fprq2\fcharset128 Droid Sans
Fallback;}{\f6\fnil\fprq2\fcharset128 DejaVu
Sans;}{\f7\fswiss\fprq0\fcharset128 DejaVu
Sans;}}\n{\colortbl;\red0\green0\blue0;\red128\green128\blue128;}\n{\stylesheet{\s0\snext0\nowidctlpar{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\cf0\kerning1\hich\af5\langfe2052\dbch\af6\afs24\lang1081\loch\f3\fs24\lang1033
Default;}\n{\s15\sbasedon0\snext16\sb240\sa120\keepn\hich\af5\dbch\af6\afs28\loch\f4\fs28
Heading;}\n{\s16\sbasedon0\snext16\sb0\sa120 Text
body;}\n{\s17\sbasedon16\snext17\sb0\sa120\dbch\af7
List;}\n{\s18\sbasedon0\snext18\sb120\sa120\noline\i\dbch\af7\afs24\ai\fs24
Caption;}\n{\s19\sbasedon0\snext19\noline\dbch\af7
Index;}\n}{\info{\creatim\yr2018\mo7\dy15\hr11\min52}{\revtim\yr0\mo0\dy0\hr0\min0}{\printim\yr0\mo0\dy0\hr0\min0}{\comment
OpenOffice}{\vern4140}}\deftab709\n\n{\*\pgdsctbl\n{\pgdsc0\pgdscuse195\pgwsxn12240\pghsxn15840\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0
Default;}}\n\formshade\paperh15840\paperw12240\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\sectunlocked1\pgndec\pgwsxn12240\pghsxn15840\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc\n\pgndec\pard\plain
\s0\nowidctlpar{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\cf0\kerning1\hich\af5\langfe2052\dbch\af6\afs24\lang1081\loch\f3\fs24\lang1033{\rtlch
\ltrch\loch\nI like to read.}\n\par }
按照 Rob Miller "Text Processing with Ruby" 的示例,我有以下 Parslet 解析器:
require "parslet"
# Parslet grammar for a subset of the Rich Text Format (RTF).
#
# The root rule is :file. A successful parse yields a hash with a :header
# entry (:rtf/:version, :charset, :deff, :color_table) and a :document entry
# holding the items produced by the :content rule.
class Rtf < Parslet::Parser
  # --- Single-character building blocks -----------------------------------
  rule(:space)   { str(" ") }
  rule(:hypen)   { str("-") }
  rule(:integer) { match["0-9"].repeat(1) }
  rule(:newline) { str("\n") }

  # One literal backslash. The backslash has to be escaped; a lone backslash
  # would escape the closing quote and leave the string literal unterminated.
  rule(:slash) { str("\\") }

  # Zero or more lowercase letters (the name part of a control word).
  rule(:letter_sequence) { match["a-z"].repeat }

  # Characters that end a run of plain text: backslash, "{" and "}".
  # Four backslashes are required: the Ruby string yields two, which the
  # character class then reads as one literal backslash.
  rule(:special_chars) { match["\\\\{}"] }

  # --- Document content -----------------------------------------------------
  # One or more characters that are not special characters, captured as :text.
  rule(:unformatted_text) do
    (special_chars.absent? >> any).repeat(1).as(:text)
  end

  # A backslash, a letter sequence (:word) and an optional delimiter
  # (:delimiter), captured together as :control_word.
  rule(:control_word) do
    (
      slash >>
      letter_sequence.as(:word) >>
      control_delimiter.maybe.as(:delimiter)
    ).as(:control_word)
  end

  # What may directly follow a control word's letters: a single space, an
  # integer with an optional leading hyphen, or a semicolon.
  rule(:control_delimiter) { space | (hypen.maybe >> integer) | str(";") }

  # A brace-delimited group whose body is :content.
  # NOTE(review): the matched group is not given a name via .as(...).
  rule(:group) do
    str("{") >>
      newline.maybe >>
      content >>
      newline.maybe >>
      str("}")
  end

  # Any number of text runs, control words and groups, in any order.
  rule(:content) do
    (unformatted_text | control_word | group).repeat
  end

  # --- Header ---------------------------------------------------------------
  # \rtf<version>, then \<charset>, then an optional \deff<n> and an optional
  # colour table, followed by an optional newline.
  rule(:header) do
    (slash >> str("rtf") >> integer.maybe.as(:version)).as(:rtf) >>
      (slash >> letter_sequence.as(:charset)) >>
      (slash >> str("deff") >> integer.maybe).maybe.as(:deff) >>
      color_table.maybe.as(:color_table) >>
      newline.maybe
  end

  # {\colortbl; <colour definitions> } with optional surrounding newlines.
  # The control word is spelled "colortbl", as in the RTF data being parsed.
  rule(:color_table) do
    newline.maybe >>
      str("{") >>
      (slash >> str("colortbl;")) >>
      color_definition.repeat(1).as(:colors) >>
      str("}") >>
      newline.maybe
  end

  # One \red<n>\green<n>\blue<n>; entry; each component is captured as
  # { int: <digits> } under :red, :green and :blue respectively.
  rule(:color_definition) do
    slash >> str("red") >> integer.as(:int).as(:red) >>
      slash >> str("green") >> integer.as(:int).as(:green) >>
      slash >> str("blue") >> integer.as(:int).as(:blue) >>
      str(";")
  end

  # --- Entry point ----------------------------------------------------------
  # The whole file: "{", the header, the document content, "}" and an
  # optional trailing newline.
  rule(:file) do
    str("{") >>
      header.as(:header) >>
      content.as(:document) >>
      str("}") >>
      newline.maybe
  end

  root :file
end
使用上述 Parslet 解析 rtf 文件产生:
(byebug) parsed {:header=>{:rtf=>{:version=>"1"@5},
:charset=>"ansi"@7, :deff=>"\deff3"@11, :color_table=>nil},
:document=>[{:control_word=>{:word=>"adeflang"@18,
:delimiter=>"1025"@26}}, {:text=>"\n"@30}, {:text=>"\n"@374},
{:text=>"\n"@431}, {:control_word=>{:word=>"deftab"@1050,
:delimiter=>"709"@1056}}, {:text=>"\n\n"@1059}, {:text=>"\n"@1191},
{:control_word=>{:word=>"formshade"@1193, :delimiter=>nil}},
{:control_word=>{:word=>"paperh"@1203, :delimiter=>"15840"@1209}},
{:control_word=>{:word=>"paperw"@1215, :delimiter=>"12240"@1221}},
{:control_word=>{:word=>"margl"@1227, :delimiter=>"1134"@1232}},
{:control_word=>{:word=>"margr"@1237, :delimiter=>"1134"@1242}},
{:control_word=>{:word=>"margt"@1247, :delimiter=>"1134"@1252}},
{:control_word=>{:word=>"margb"@1257, :delimiter=>"1134"@1262}},
{:control_word=>{:word=>"sectd"@1267, :delimiter=>nil}},
{:control_word=>{:word=>"sbknone"@1273, :delimiter=>nil}},
{:control_word=>{:word=>"sectunlocked"@1281, :delimiter=>"1"@1293}},
{:control_word=>{:word=>"pgndec"@1295, :delimiter=>nil}},
{:control_word=>{:word=>"pgwsxn"@1302, :delimiter=>"12240"@1308}},
{:control_word=>{:word=>"pghsxn"@1314, :delimiter=>"15840"@1320}},
{:control_word=>{:word=>"marglsxn"@1326, :delimiter=>"1134"@1334}},
{:control_word=>{:word=>"margrsxn"@1339, :delimiter=>"1134"@1347}},
{:control_word=>{:word=>"margtsxn"@1352, :delimiter=>"1134"@1360}},
{:control_word=>{:word=>"margbsxn"@1365, :delimiter=>"1134"@1373}},
{:control_word=>{:word=>"ftnbj"@1378, :delimiter=>nil}},
{:control_word=>{:word=>"ftnstart"@1384, :delimiter=>"1"@1392}},
{:control_word=>{:word=>"ftnrstcont"@1394, :delimiter=>nil}},
{:control_word=>{:word=>"ftnnar"@1405, :delimiter=>nil}},
{:control_word=>{:word=>"aenddoc"@1412, :delimiter=>nil}},
{:control_word=>{:word=>"aftnrstcont"@1420, :delimiter=>nil}},
{:control_word=>{:word=>"aftnstart"@1432, :delimiter=>"1"@1441}},
{:control_word=>{:word=>"aftnnrlc"@1443, :delimiter=>nil}},
{:text=>"\n"@1451}, {:control_word=>{:word=>"pgndec"@1453,
:delimiter=>nil}}, {:control_word=>{:word=>"pard"@1460,
:delimiter=>nil}}, {:control_word=>{:word=>"plain"@1465,
:delimiter=>" "@1470}}, {:control_word=>{:word=>"s"@1472, :delimiter=>"0"@1473}},
{:control_word=>{:word=>"nowidctlpar"@1475, :delimiter=>nil}},
{:control_word=>{:word=>"cf"@1529, :delimiter=>"0"@1531}},
{:control_word=>{:word=>"kerning"@1533, :delimiter=>"1"@1540}},
{:control_word=>{:word=>"hich"@1542, :delimiter=>nil}},
{:control_word=>{:word=>"af"@1547, :delimiter=>"5"@1549}},
{:control_word=>{:word=>"langfe"@1551, :delimiter=>"2052"@1557}},
{:control_word=>{:word=>"dbch"@1562, :delimiter=>nil}},
{:control_word=>{:word=>"af"@1567, :delimiter=>"6"@1569}},
{:control_word=>{:word=>"afs"@1571, :delimiter=>"24"@1574}},
{:control_word=>{:word=>"lang"@1577, :delimiter=>"1081"@1581}},
{:control_word=>{:word=>"loch"@1586, :delimiter=>nil}},
{:control_word=>{:word=>"f"@1591, :delimiter=>"3"@1592}},
{:control_word=>{:word=>"fs"@1594, :delimiter=>"24"@1596}},
{:control_word=>{:word=>"lang"@1599, :delimiter=>"1033"@1603}},
{:text=>"\n"@1643}, {:control_word=>{:word=>"par"@1645,
:delimiter=>" "@1648}}]}
RTF 文件中的文本(即 "I like to read.")完全没有被解析出来,我不知道为什么。任何指导将不胜感激。
这是因为解析结果中丢失了所有的组(group)。
# Matches a brace-delimited group and labels the result :group.
rule(:group) do
  braces = str("{") >> newline.maybe >> content >> newline.maybe >> str("}")
  braces.as(:group)
end
添加'as(:group)'
现在你得到了
{:header=>
{:rtf=>{:version=>"1"@5},
:charset=>"ansi"@7,
:deff=>"\deff3"@11,
:color_table=>nil},
:document=>
[{:control_word=>{:word=>"adeflang"@18, :delimiter=>"1025"@26}},
{:control_word=>{:word=>"n"@31, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"fonttbl"@34, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"f"@43, :delimiter=>"0"@44}},
{:control_word=>{:word=>"froman"@46, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@53, :delimiter=>"2"@57}},
{:control_word=>{:word=>"fcharset"@59, :delimiter=>"0"@67}},
{:text=>" Times New Roman;"@68}]},
{:group=>
[{:control_word=>{:word=>"f"@88, :delimiter=>"1"@89}},
{:control_word=>{:word=>"froman"@91, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@98, :delimiter=>"2"@102}},
{:control_word=>{:word=>"fcharset"@104, :delimiter=>"2"@112}},
{:text=>" Symbol;"@113}]},
{:group=>
[{:control_word=>{:word=>"f"@124, :delimiter=>"2"@125}},
{:control_word=>{:word=>"fswiss"@127, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@134, :delimiter=>"2"@138}},
{:control_word=>{:word=>"fcharset"@140, :delimiter=>"0"@148}},
{:text=>" Arial;"@149}]},
{:group=>
[{:control_word=>{:word=>"f"@159, :delimiter=>"3"@160}},
{:control_word=>{:word=>"froman"@162, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@169, :delimiter=>"2"@173}},
{:control_word=>{:word=>"fcharset"@175, :delimiter=>"128"@183}},
{:text=>" Times New Roman;"@186}]},
{:group=>
[{:control_word=>{:word=>"f"@206, :delimiter=>"4"@207}},
{:control_word=>{:word=>"fswiss"@209, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@216, :delimiter=>"2"@220}},
{:control_word=>{:word=>"fcharset"@222, :delimiter=>"128"@230}},
{:text=>" Arial;"@233}]},
{:group=>
[{:control_word=>{:word=>"f"@243, :delimiter=>"5"@244}},
{:control_word=>{:word=>"fnil"@246, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@251, :delimiter=>"2"@255}},
{:control_word=>{:word=>"fcharset"@257, :delimiter=>"128"@265}},
{:text=>" Droid Sans Fallback;"@268}]},
{:group=>
[{:control_word=>{:word=>"f"@292, :delimiter=>"6"@293}},
{:control_word=>{:word=>"fnil"@295, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@300, :delimiter=>"2"@304}},
{:control_word=>{:word=>"fcharset"@306, :delimiter=>"128"@314}},
{:text=>" DejaVu Sans;"@317}]},
{:group=>
[{:control_word=>{:word=>"f"@333, :delimiter=>"7"@334}},
{:control_word=>{:word=>"fswiss"@336, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@343, :delimiter=>"0"@347}},
{:control_word=>{:word=>"fcharset"@349, :delimiter=>"128"@357}},
{:text=>" DejaVu Sans;"@360}]}]},
{:control_word=>{:word=>"n"@376, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"colortbl"@379, :delimiter=>";"@387}},
{:control_word=>{:word=>"red"@389, :delimiter=>"0"@392}},
{:control_word=>{:word=>"green"@394, :delimiter=>"0"@399}},
{:control_word=>{:word=>"blue"@401, :delimiter=>"0"@405}},
{:text=>";"@406},
{:control_word=>{:word=>"red"@408, :delimiter=>"128"@411}},
{:control_word=>{:word=>"green"@415, :delimiter=>"128"@420}},
{:control_word=>{:word=>"blue"@424, :delimiter=>"128"@428}},
{:text=>";"@431}]},
{:control_word=>{:word=>"n"@434, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"stylesheet"@437, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"s"@449, :delimiter=>"0"@450}},
{:control_word=>{:word=>"snext"@452, :delimiter=>"0"@457}},
{:control_word=>{:word=>"nowidctlpar"@459, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>[], :delimiter=>nil}},
{:text=>"*"@472},
{:control_word=>{:word=>"hyphen"@474, :delimiter=>"2"@480}},
{:control_word=>{:word=>"hyphlead"@482, :delimiter=>"2"@490}},
{:control_word=>{:word=>"hyphtrail"@492, :delimiter=>"2"@501}},
{:control_word=>{:word=>"hyphmax"@503, :delimiter=>"0"@510}}]},
{:control_word=>{:word=>"cf"@513, :delimiter=>"0"@515}},
{:control_word=>{:word=>"kerning"@517, :delimiter=>"1"@524}},
{:control_word=>{:word=>"hich"@526, :delimiter=>nil}},
{:control_word=>{:word=>"af"@531, :delimiter=>"5"@533}},
{:control_word=>{:word=>"langfe"@535, :delimiter=>"2052"@541}},
{:control_word=>{:word=>"dbch"@546, :delimiter=>nil}},
{:control_word=>{:word=>"af"@551, :delimiter=>"6"@553}},
{:control_word=>{:word=>"afs"@555, :delimiter=>"24"@558}},
{:control_word=>{:word=>"lang"@561, :delimiter=>"1081"@565}},
{:control_word=>{:word=>"loch"@570, :delimiter=>nil}},
{:control_word=>{:word=>"f"@575, :delimiter=>"3"@576}},
{:control_word=>{:word=>"fs"@578, :delimiter=>"24"@580}},
{:control_word=>{:word=>"lang"@583, :delimiter=>"1033"@587}},
{:text=>" Default;"@591}]},
{:control_word=>{:word=>"n"@602, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"s"@605, :delimiter=>"15"@606}},
{:control_word=>{:word=>"sbasedon"@609, :delimiter=>"0"@617}},
{:control_word=>{:word=>"snext"@619, :delimiter=>"16"@624}},
{:control_word=>{:word=>"sb"@627, :delimiter=>"240"@629}},
{:control_word=>{:word=>"sa"@633, :delimiter=>"120"@635}},
{:control_word=>{:word=>"keepn"@639, :delimiter=>nil}},
{:control_word=>{:word=>"hich"@645, :delimiter=>nil}},
{:control_word=>{:word=>"af"@650, :delimiter=>"5"@652}},
{:control_word=>{:word=>"dbch"@654, :delimiter=>nil}},
{:control_word=>{:word=>"af"@659, :delimiter=>"6"@661}},
{:control_word=>{:word=>"afs"@663, :delimiter=>"28"@666}},
{:control_word=>{:word=>"loch"@669, :delimiter=>nil}},
{:control_word=>{:word=>"f"@674, :delimiter=>"4"@675}},
{:control_word=>{:word=>"fs"@677, :delimiter=>"28"@679}},
{:text=>" Heading;"@681}]},
{:control_word=>{:word=>"n"@692, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"s"@695, :delimiter=>"16"@696}},
{:control_word=>{:word=>"sbasedon"@699, :delimiter=>"0"@707}},
{:control_word=>{:word=>"snext"@709, :delimiter=>"16"@714}},
{:control_word=>{:word=>"sb"@717, :delimiter=>"0"@719}},
{:control_word=>{:word=>"sa"@721, :delimiter=>"120"@723}},
{:text=>" Text body;"@726}]},
{:control_word=>{:word=>"n"@739, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"s"@742, :delimiter=>"17"@743}},
{:control_word=>{:word=>"sbasedon"@746, :delimiter=>"16"@754}},
{:control_word=>{:word=>"snext"@757, :delimiter=>"17"@762}},
{:control_word=>{:word=>"sb"@765, :delimiter=>"0"@767}},
{:control_word=>{:word=>"sa"@769, :delimiter=>"120"@771}},
{:control_word=>{:word=>"dbch"@775, :delimiter=>nil}},
{:control_word=>{:word=>"af"@780, :delimiter=>"7"@782}},
{:text=>" List;"@783}]},
{:control_word=>{:word=>"n"@791, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"s"@794, :delimiter=>"18"@795}},
{:control_word=>{:word=>"sbasedon"@798, :delimiter=>"0"@806}},
{:control_word=>{:word=>"snext"@808, :delimiter=>"18"@813}},
{:control_word=>{:word=>"sb"@816, :delimiter=>"120"@818}},
{:control_word=>{:word=>"sa"@822, :delimiter=>"120"@824}},
{:control_word=>{:word=>"noline"@828, :delimiter=>nil}},
{:control_word=>{:word=>"i"@835, :delimiter=>nil}},
{:control_word=>{:word=>"dbch"@837, :delimiter=>nil}},
{:control_word=>{:word=>"af"@842, :delimiter=>"7"@844}},
{:control_word=>{:word=>"afs"@846, :delimiter=>"24"@849}},
{:control_word=>{:word=>"ai"@852, :delimiter=>nil}},
{:control_word=>{:word=>"fs"@855, :delimiter=>"24"@857}},
{:text=>" Caption;"@859}]},
{:control_word=>{:word=>"n"@870, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"s"@873, :delimiter=>"19"@874}},
{:control_word=>{:word=>"sbasedon"@877, :delimiter=>"0"@885}},
{:control_word=>{:word=>"snext"@887, :delimiter=>"19"@892}},
{:control_word=>{:word=>"noline"@895, :delimiter=>nil}},
{:control_word=>{:word=>"dbch"@902, :delimiter=>nil}},
{:control_word=>{:word=>"af"@907, :delimiter=>"7"@909}},
{:text=>" Index;"@910}]},
{:control_word=>{:word=>"n"@919, :delimiter=>nil}}]},
{:group=>
[{:control_word=>{:word=>"info"@923, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"creatim"@929, :delimiter=>nil}},
{:control_word=>{:word=>"yr"@937, :delimiter=>"2018"@939}},
{:control_word=>{:word=>"mo"@944, :delimiter=>"7"@946}},
{:control_word=>{:word=>"dy"@948, :delimiter=>"15"@950}},
{:control_word=>{:word=>"hr"@953, :delimiter=>"11"@955}},
{:control_word=>{:word=>"min"@958, :delimiter=>"52"@961}}]},
{:group=>
[{:control_word=>{:word=>"revtim"@966, :delimiter=>nil}},
{:control_word=>{:word=>"yr"@973, :delimiter=>"0"@975}},
{:control_word=>{:word=>"mo"@977, :delimiter=>"0"@979}},
{:control_word=>{:word=>"dy"@981, :delimiter=>"0"@983}},
{:control_word=>{:word=>"hr"@985, :delimiter=>"0"@987}},
{:control_word=>{:word=>"min"@989, :delimiter=>"0"@992}}]},
{:group=>
[{:control_word=>{:word=>"printim"@996, :delimiter=>nil}},
{:control_word=>{:word=>"yr"@1004, :delimiter=>"0"@1006}},
{:control_word=>{:word=>"mo"@1008, :delimiter=>"0"@1010}},
{:control_word=>{:word=>"dy"@1012, :delimiter=>"0"@1014}},
{:control_word=>{:word=>"hr"@1016, :delimiter=>"0"@1018}},
{:control_word=>{:word=>"min"@1020, :delimiter=>"0"@1023}}]},
{:group=>
[{:control_word=>{:word=>"comment"@1027, :delimiter=>" "@1034}},
{:text=>"OpenOffice"@1035}]},
{:group=>
[{:control_word=>{:word=>"vern"@1048, :delimiter=>"4140"@1052}}]}]},
{:control_word=>{:word=>"deftab"@1059, :delimiter=>"709"@1065}},
{:control_word=>{:word=>"n"@1069, :delimiter=>nil}},
{:control_word=>{:word=>"n"@1071, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>[], :delimiter=>nil}},
{:text=>"*"@1074},
{:control_word=>{:word=>"pgdsctbl"@1076, :delimiter=>nil}},
{:control_word=>{:word=>"n"@1085, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"pgdsc"@1088, :delimiter=>"0"@1093}},
{:control_word=>{:word=>"pgdscuse"@1095, :delimiter=>"195"@1103}},
{:control_word=>{:word=>"pgwsxn"@1107, :delimiter=>"12240"@1113}},
{:control_word=>{:word=>"pghsxn"@1119, :delimiter=>"15840"@1125}},
{:control_word=>{:word=>"marglsxn"@1131, :delimiter=>"1134"@1139}},
{:control_word=>{:word=>"margrsxn"@1144, :delimiter=>"1134"@1152}},
{:control_word=>{:word=>"margtsxn"@1157, :delimiter=>"1134"@1165}},
{:control_word=>{:word=>"margbsxn"@1170, :delimiter=>"1134"@1178}},
{:control_word=>{:word=>"pgdscnxt"@1183, :delimiter=>"0"@1191}},
{:text=>" Default;"@1192}]}]},
{:control_word=>{:word=>"n"@1204, :delimiter=>nil}},
{:control_word=>{:word=>"formshade"@1206, :delimiter=>nil}},
{:control_word=>{:word=>"paperh"@1216, :delimiter=>"15840"@1222}},
{:control_word=>{:word=>"paperw"@1228, :delimiter=>"12240"@1234}},
{:control_word=>{:word=>"margl"@1240, :delimiter=>"1134"@1245}},
{:control_word=>{:word=>"margr"@1250, :delimiter=>"1134"@1255}},
{:control_word=>{:word=>"margt"@1260, :delimiter=>"1134"@1265}},
{:control_word=>{:word=>"margb"@1270, :delimiter=>"1134"@1275}},
{:control_word=>{:word=>"sectd"@1280, :delimiter=>nil}},
{:control_word=>{:word=>"sbknone"@1286, :delimiter=>nil}},
{:control_word=>{:word=>"sectunlocked"@1294, :delimiter=>"1"@1306}},
{:control_word=>{:word=>"pgndec"@1308, :delimiter=>nil}},
{:control_word=>{:word=>"pgwsxn"@1315, :delimiter=>"12240"@1321}},
{:control_word=>{:word=>"pghsxn"@1327, :delimiter=>"15840"@1333}},
{:control_word=>{:word=>"marglsxn"@1339, :delimiter=>"1134"@1347}},
{:control_word=>{:word=>"margrsxn"@1352, :delimiter=>"1134"@1360}},
{:control_word=>{:word=>"margtsxn"@1365, :delimiter=>"1134"@1373}},
{:control_word=>{:word=>"margbsxn"@1378, :delimiter=>"1134"@1386}},
{:control_word=>{:word=>"ftnbj"@1391, :delimiter=>nil}},
{:control_word=>{:word=>"ftnstart"@1397, :delimiter=>"1"@1405}},
{:control_word=>{:word=>"ftnrstcont"@1407, :delimiter=>nil}},
{:control_word=>{:word=>"ftnnar"@1418, :delimiter=>nil}},
{:control_word=>{:word=>"aenddoc"@1425, :delimiter=>nil}},
{:control_word=>{:word=>"aftnrstcont"@1433, :delimiter=>nil}},
{:control_word=>{:word=>"aftnstart"@1445, :delimiter=>"1"@1454}},
{:control_word=>{:word=>"aftnnrlc"@1456, :delimiter=>nil}},
{:control_word=>{:word=>"n"@1465, :delimiter=>nil}},
{:control_word=>{:word=>"pgndec"@1467, :delimiter=>nil}},
{:control_word=>{:word=>"pard"@1474, :delimiter=>nil}},
{:control_word=>{:word=>"plain"@1479, :delimiter=>" "@1484}},
{:control_word=>{:word=>"s"@1486, :delimiter=>"0"@1487}},
{:control_word=>{:word=>"nowidctlpar"@1489, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>[], :delimiter=>nil}},
{:text=>"*"@1502},
{:control_word=>{:word=>"hyphen"@1504, :delimiter=>"2"@1510}},
{:control_word=>{:word=>"hyphlead"@1512, :delimiter=>"2"@1520}},
{:control_word=>{:word=>"hyphtrail"@1522, :delimiter=>"2"@1531}},
{:control_word=>{:word=>"hyphmax"@1533, :delimiter=>"0"@1540}}]},
{:control_word=>{:word=>"cf"@1543, :delimiter=>"0"@1545}},
{:control_word=>{:word=>"kerning"@1547, :delimiter=>"1"@1554}},
{:control_word=>{:word=>"hich"@1556, :delimiter=>nil}},
{:control_word=>{:word=>"af"@1561, :delimiter=>"5"@1563}},
{:control_word=>{:word=>"langfe"@1565, :delimiter=>"2052"@1571}},
{:control_word=>{:word=>"dbch"@1576, :delimiter=>nil}},
{:control_word=>{:word=>"af"@1581, :delimiter=>"6"@1583}},
{:control_word=>{:word=>"afs"@1585, :delimiter=>"24"@1588}},
{:control_word=>{:word=>"lang"@1591, :delimiter=>"1081"@1595}},
{:control_word=>{:word=>"loch"@1600, :delimiter=>nil}},
{:control_word=>{:word=>"f"@1605, :delimiter=>"3"@1606}},
{:control_word=>{:word=>"fs"@1608, :delimiter=>"24"@1610}},
{:control_word=>{:word=>"lang"@1613, :delimiter=>"1033"@1617}},
{:group=>
[{:control_word=>{:word=>"rtlch"@1623, :delimiter=>" "@1628}},
{:control_word=>{:word=>"ltrch"@1630, :delimiter=>nil}},
{:control_word=>{:word=>"loch"@1636, :delimiter=>nil}},
{:control_word=>{:word=>"n"@1641, :delimiter=>nil}},
{:text=>"I like to read."@1642}]},
{:control_word=>{:word=>"n"@1659, :delimiter=>nil}},
{:control_word=>{:word=>"par"@1661, :delimiter=>" "@1664}}]}
我有来自富文本格式 (RTF) 文件的以下数据:
{\rtf1\ansi\deff3\adeflang1025\n{\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset128 Times New Roman;}{\f4\fswiss\fprq2\fcharset128 Arial;}{\f5\fnil\fprq2\fcharset128 Droid Sans Fallback;}{\f6\fnil\fprq2\fcharset128 DejaVu Sans;}{\f7\fswiss\fprq0\fcharset128 DejaVu Sans;}}\n{\colortbl;\red0\green0\blue0;\red128\green128\blue128;}\n{\stylesheet{\s0\snext0\nowidctlpar{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\cf0\kerning1\hich\af5\langfe2052\dbch\af6\afs24\lang1081\loch\f3\fs24\lang1033 Default;}\n{\s15\sbasedon0\snext16\sb240\sa120\keepn\hich\af5\dbch\af6\afs28\loch\f4\fs28 Heading;}\n{\s16\sbasedon0\snext16\sb0\sa120 Text body;}\n{\s17\sbasedon16\snext17\sb0\sa120\dbch\af7 List;}\n{\s18\sbasedon0\snext18\sb120\sa120\noline\i\dbch\af7\afs24\ai\fs24 Caption;}\n{\s19\sbasedon0\snext19\noline\dbch\af7 Index;}\n}{\info{\creatim\yr2018\mo7\dy15\hr11\min52}{\revtim\yr0\mo0\dy0\hr0\min0}{\printim\yr0\mo0\dy0\hr0\min0}{\comment OpenOffice}{\vern4140}}\deftab709\n\n{\*\pgdsctbl\n{\pgdsc0\pgdscuse195\pgwsxn12240\pghsxn15840\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 Default;}}\n\formshade\paperh15840\paperw12240\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\sectunlocked1\pgndec\pgwsxn12240\pghsxn15840\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc\n\pgndec\pard\plain \s0\nowidctlpar{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\cf0\kerning1\hich\af5\langfe2052\dbch\af6\afs24\lang1081\loch\f3\fs24\lang1033{\rtlch \ltrch\loch\nI like to read.}\n\par }
按照 Rob Miller "Text Processing with Ruby" 的示例,我有以下 Parslet 解析器:
require "parslet"
# Parslet grammar for a subset of the Rich Text Format (RTF).
#
# The root rule is :file. A successful parse yields a hash with a :header
# entry (:rtf/:version, :charset, :deff, :color_table) and a :document entry
# holding the items produced by the :content rule.
class Rtf < Parslet::Parser
  # --- Single-character building blocks -----------------------------------
  rule(:space)   { str(" ") }
  rule(:hypen)   { str("-") }
  rule(:integer) { match["0-9"].repeat(1) }
  rule(:newline) { str("\n") }

  # One literal backslash. The backslash has to be escaped; a lone backslash
  # would escape the closing quote and leave the string literal unterminated.
  rule(:slash) { str("\\") }

  # Zero or more lowercase letters (the name part of a control word).
  rule(:letter_sequence) { match["a-z"].repeat }

  # Characters that end a run of plain text: backslash, "{" and "}".
  # Four backslashes are required: the Ruby string yields two, which the
  # character class then reads as one literal backslash.
  rule(:special_chars) { match["\\\\{}"] }

  # --- Document content -----------------------------------------------------
  # One or more characters that are not special characters, captured as :text.
  rule(:unformatted_text) do
    (special_chars.absent? >> any).repeat(1).as(:text)
  end

  # A backslash, a letter sequence (:word) and an optional delimiter
  # (:delimiter), captured together as :control_word.
  rule(:control_word) do
    (
      slash >>
      letter_sequence.as(:word) >>
      control_delimiter.maybe.as(:delimiter)
    ).as(:control_word)
  end

  # What may directly follow a control word's letters: a single space, an
  # integer with an optional leading hyphen, or a semicolon.
  rule(:control_delimiter) { space | (hypen.maybe >> integer) | str(";") }

  # A brace-delimited group whose body is :content.
  # NOTE(review): the matched group is not given a name via .as(...).
  rule(:group) do
    str("{") >>
      newline.maybe >>
      content >>
      newline.maybe >>
      str("}")
  end

  # Any number of text runs, control words and groups, in any order.
  rule(:content) do
    (unformatted_text | control_word | group).repeat
  end

  # --- Header ---------------------------------------------------------------
  # \rtf<version>, then \<charset>, then an optional \deff<n> and an optional
  # colour table, followed by an optional newline.
  rule(:header) do
    (slash >> str("rtf") >> integer.maybe.as(:version)).as(:rtf) >>
      (slash >> letter_sequence.as(:charset)) >>
      (slash >> str("deff") >> integer.maybe).maybe.as(:deff) >>
      color_table.maybe.as(:color_table) >>
      newline.maybe
  end

  # {\colortbl; <colour definitions> } with optional surrounding newlines.
  # The control word is spelled "colortbl", as in the RTF data being parsed.
  rule(:color_table) do
    newline.maybe >>
      str("{") >>
      (slash >> str("colortbl;")) >>
      color_definition.repeat(1).as(:colors) >>
      str("}") >>
      newline.maybe
  end

  # One \red<n>\green<n>\blue<n>; entry; each component is captured as
  # { int: <digits> } under :red, :green and :blue respectively.
  rule(:color_definition) do
    slash >> str("red") >> integer.as(:int).as(:red) >>
      slash >> str("green") >> integer.as(:int).as(:green) >>
      slash >> str("blue") >> integer.as(:int).as(:blue) >>
      str(";")
  end

  # --- Entry point ----------------------------------------------------------
  # The whole file: "{", the header, the document content, "}" and an
  # optional trailing newline.
  rule(:file) do
    str("{") >>
      header.as(:header) >>
      content.as(:document) >>
      str("}") >>
      newline.maybe
  end

  root :file
end
使用上述 Parslet 解析 rtf 文件产生:
RTF 文件中文本的(byebug) parsed {:header=>{:rtf=>{:version=>"1"@5}, :charset=>"ansi"@7, :deff=>"\deff3"@11, :color_table=>nil}, :document=>[{:control_word=>{:word=>"adeflang"@18, :delimiter=>"1025"@26}}, {:text=>"\n"@30}, {:text=>"\n"@374}, {:text=>"\n"@431}, {:control_word=>{:word=>"deftab"@1050, :delimiter=>"709"@1056}}, {:text=>"\n\n"@1059}, {:text=>"\n"@1191}, {:control_word=>{:word=>"formshade"@1193, :delimiter=>nil}}, {:control_word=>{:word=>"paperh"@1203, :delimiter=>"15840"@1209}}, {:control_word=>{:word=>"paperw"@1215, :delimiter=>"12240"@1221}}, {:control_word=>{:word=>"margl"@1227, :delimiter=>"1134"@1232}}, {:control_word=>{:word=>"margr"@1237, :delimiter=>"1134"@1242}}, {:control_word=>{:word=>"margt"@1247, :delimiter=>"1134"@1252}}, {:control_word=>{:word=>"margb"@1257, :delimiter=>"1134"@1262}}, {:control_word=>{:word=>"sectd"@1267, :delimiter=>nil}}, {:control_word=>{:word=>"sbknone"@1273, :delimiter=>nil}}, {:control_word=>{:word=>"sectunlocked"@1281, :delimiter=>"1"@1293}}, {:control_word=>{:word=>"pgndec"@1295, :delimiter=>nil}}, {:control_word=>{:word=>"pgwsxn"@1302, :delimiter=>"12240"@1308}}, {:control_word=>{:word=>"pghsxn"@1314, :delimiter=>"15840"@1320}}, {:control_word=>{:word=>"marglsxn"@1326, :delimiter=>"1134"@1334}}, {:control_word=>{:word=>"margrsxn"@1339, :delimiter=>"1134"@1347}}, {:control_word=>{:word=>"margtsxn"@1352, :delimiter=>"1134"@1360}}, {:control_word=>{:word=>"margbsxn"@1365, :delimiter=>"1134"@1373}}, {:control_word=>{:word=>"ftnbj"@1378, :delimiter=>nil}}, {:control_word=>{:word=>"ftnstart"@1384, :delimiter=>"1"@1392}}, {:control_word=>{:word=>"ftnrstcont"@1394, :delimiter=>nil}}, {:control_word=>{:word=>"ftnnar"@1405, :delimiter=>nil}}, {:control_word=>{:word=>"aenddoc"@1412, :delimiter=>nil}}, {:control_word=>{:word=>"aftnrstcont"@1420, :delimiter=>nil}}, {:control_word=>{:word=>"aftnstart"@1432, :delimiter=>"1"@1441}}, {:control_word=>{:word=>"aftnnrlc"@1443, :delimiter=>nil}}, {:text=>"\n"@1451}, 
{:control_word=>{:word=>"pgndec"@1453, :delimiter=>nil}}, {:control_word=>{:word=>"pard"@1460, :delimiter=>nil}}, {:control_word=>{:word=>"plain"@1465, :delimiter=>" "@1470}}, {:control_word=>{:word=>"s"@1472, :delimiter=>"0"@1473}}, {:control_word=>{:word=>"nowidctlpar"@1475, :delimiter=>nil}}, {:control_word=>{:word=>"cf"@1529, :delimiter=>"0"@1531}}, {:control_word=>{:word=>"kerning"@1533, :delimiter=>"1"@1540}}, {:control_word=>{:word=>"hich"@1542, :delimiter=>nil}}, {:control_word=>{:word=>"af"@1547, :delimiter=>"5"@1549}}, {:control_word=>{:word=>"langfe"@1551, :delimiter=>"2052"@1557}}, {:control_word=>{:word=>"dbch"@1562, :delimiter=>nil}}, {:control_word=>{:word=>"af"@1567, :delimiter=>"6"@1569}}, {:control_word=>{:word=>"afs"@1571, :delimiter=>"24"@1574}}, {:control_word=>{:word=>"lang"@1577, :delimiter=>"1081"@1581}}, {:control_word=>{:word=>"loch"@1586, :delimiter=>nil}}, {:control_word=>{:word=>"f"@1591, :delimiter=>"3"@1592}}, {:control_word=>{:word=>"fs"@1594, :delimiter=>"24"@1596}}, {:control_word=>{:word=>"lang"@1599, :delimiter=>"1033"@1603}}, {:text=>"\n"@1643}, {:control_word=>{:word=>"par"@1645, :delimiter=>" "@1648}}]}
RTF 文件中的文本(即 "I like to read.")完全没有被解析出来,我不知道为什么。任何指导将不胜感激。
这是因为解析结果中丢失了所有的组(group)。
# Matches a brace-delimited group and labels the result :group.
rule(:group) do
  braces = str("{") >> newline.maybe >> content >> newline.maybe >> str("}")
  braces.as(:group)
end
添加'as(:group)'
现在你得到了
{:header=>
{:rtf=>{:version=>"1"@5},
:charset=>"ansi"@7,
:deff=>"\deff3"@11,
:color_table=>nil},
:document=>
[{:control_word=>{:word=>"adeflang"@18, :delimiter=>"1025"@26}},
{:control_word=>{:word=>"n"@31, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"fonttbl"@34, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"f"@43, :delimiter=>"0"@44}},
{:control_word=>{:word=>"froman"@46, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@53, :delimiter=>"2"@57}},
{:control_word=>{:word=>"fcharset"@59, :delimiter=>"0"@67}},
{:text=>" Times New Roman;"@68}]},
{:group=>
[{:control_word=>{:word=>"f"@88, :delimiter=>"1"@89}},
{:control_word=>{:word=>"froman"@91, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@98, :delimiter=>"2"@102}},
{:control_word=>{:word=>"fcharset"@104, :delimiter=>"2"@112}},
{:text=>" Symbol;"@113}]},
{:group=>
[{:control_word=>{:word=>"f"@124, :delimiter=>"2"@125}},
{:control_word=>{:word=>"fswiss"@127, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@134, :delimiter=>"2"@138}},
{:control_word=>{:word=>"fcharset"@140, :delimiter=>"0"@148}},
{:text=>" Arial;"@149}]},
{:group=>
[{:control_word=>{:word=>"f"@159, :delimiter=>"3"@160}},
{:control_word=>{:word=>"froman"@162, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@169, :delimiter=>"2"@173}},
{:control_word=>{:word=>"fcharset"@175, :delimiter=>"128"@183}},
{:text=>" Times New Roman;"@186}]},
{:group=>
[{:control_word=>{:word=>"f"@206, :delimiter=>"4"@207}},
{:control_word=>{:word=>"fswiss"@209, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@216, :delimiter=>"2"@220}},
{:control_word=>{:word=>"fcharset"@222, :delimiter=>"128"@230}},
{:text=>" Arial;"@233}]},
{:group=>
[{:control_word=>{:word=>"f"@243, :delimiter=>"5"@244}},
{:control_word=>{:word=>"fnil"@246, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@251, :delimiter=>"2"@255}},
{:control_word=>{:word=>"fcharset"@257, :delimiter=>"128"@265}},
{:text=>" Droid Sans Fallback;"@268}]},
{:group=>
[{:control_word=>{:word=>"f"@292, :delimiter=>"6"@293}},
{:control_word=>{:word=>"fnil"@295, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@300, :delimiter=>"2"@304}},
{:control_word=>{:word=>"fcharset"@306, :delimiter=>"128"@314}},
{:text=>" DejaVu Sans;"@317}]},
{:group=>
[{:control_word=>{:word=>"f"@333, :delimiter=>"7"@334}},
{:control_word=>{:word=>"fswiss"@336, :delimiter=>nil}},
{:control_word=>{:word=>"fprq"@343, :delimiter=>"0"@347}},
{:control_word=>{:word=>"fcharset"@349, :delimiter=>"128"@357}},
{:text=>" DejaVu Sans;"@360}]}]},
{:control_word=>{:word=>"n"@376, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"colortbl"@379, :delimiter=>";"@387}},
{:control_word=>{:word=>"red"@389, :delimiter=>"0"@392}},
{:control_word=>{:word=>"green"@394, :delimiter=>"0"@399}},
{:control_word=>{:word=>"blue"@401, :delimiter=>"0"@405}},
{:text=>";"@406},
{:control_word=>{:word=>"red"@408, :delimiter=>"128"@411}},
{:control_word=>{:word=>"green"@415, :delimiter=>"128"@420}},
{:control_word=>{:word=>"blue"@424, :delimiter=>"128"@428}},
{:text=>";"@431}]},
{:control_word=>{:word=>"n"@434, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"stylesheet"@437, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"s"@449, :delimiter=>"0"@450}},
{:control_word=>{:word=>"snext"@452, :delimiter=>"0"@457}},
{:control_word=>{:word=>"nowidctlpar"@459, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>[], :delimiter=>nil}},
{:text=>"*"@472},
{:control_word=>{:word=>"hyphen"@474, :delimiter=>"2"@480}},
{:control_word=>{:word=>"hyphlead"@482, :delimiter=>"2"@490}},
{:control_word=>{:word=>"hyphtrail"@492, :delimiter=>"2"@501}},
{:control_word=>{:word=>"hyphmax"@503, :delimiter=>"0"@510}}]},
{:control_word=>{:word=>"cf"@513, :delimiter=>"0"@515}},
{:control_word=>{:word=>"kerning"@517, :delimiter=>"1"@524}},
{:control_word=>{:word=>"hich"@526, :delimiter=>nil}},
{:control_word=>{:word=>"af"@531, :delimiter=>"5"@533}},
{:control_word=>{:word=>"langfe"@535, :delimiter=>"2052"@541}},
{:control_word=>{:word=>"dbch"@546, :delimiter=>nil}},
{:control_word=>{:word=>"af"@551, :delimiter=>"6"@553}},
{:control_word=>{:word=>"afs"@555, :delimiter=>"24"@558}},
{:control_word=>{:word=>"lang"@561, :delimiter=>"1081"@565}},
{:control_word=>{:word=>"loch"@570, :delimiter=>nil}},
{:control_word=>{:word=>"f"@575, :delimiter=>"3"@576}},
{:control_word=>{:word=>"fs"@578, :delimiter=>"24"@580}},
{:control_word=>{:word=>"lang"@583, :delimiter=>"1033"@587}},
{:text=>" Default;"@591}]},
{:control_word=>{:word=>"n"@602, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"s"@605, :delimiter=>"15"@606}},
{:control_word=>{:word=>"sbasedon"@609, :delimiter=>"0"@617}},
{:control_word=>{:word=>"snext"@619, :delimiter=>"16"@624}},
{:control_word=>{:word=>"sb"@627, :delimiter=>"240"@629}},
{:control_word=>{:word=>"sa"@633, :delimiter=>"120"@635}},
{:control_word=>{:word=>"keepn"@639, :delimiter=>nil}},
{:control_word=>{:word=>"hich"@645, :delimiter=>nil}},
{:control_word=>{:word=>"af"@650, :delimiter=>"5"@652}},
{:control_word=>{:word=>"dbch"@654, :delimiter=>nil}},
{:control_word=>{:word=>"af"@659, :delimiter=>"6"@661}},
{:control_word=>{:word=>"afs"@663, :delimiter=>"28"@666}},
{:control_word=>{:word=>"loch"@669, :delimiter=>nil}},
{:control_word=>{:word=>"f"@674, :delimiter=>"4"@675}},
{:control_word=>{:word=>"fs"@677, :delimiter=>"28"@679}},
{:text=>" Heading;"@681}]},
{:control_word=>{:word=>"n"@692, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"s"@695, :delimiter=>"16"@696}},
{:control_word=>{:word=>"sbasedon"@699, :delimiter=>"0"@707}},
{:control_word=>{:word=>"snext"@709, :delimiter=>"16"@714}},
{:control_word=>{:word=>"sb"@717, :delimiter=>"0"@719}},
{:control_word=>{:word=>"sa"@721, :delimiter=>"120"@723}},
{:text=>" Text body;"@726}]},
{:control_word=>{:word=>"n"@739, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"s"@742, :delimiter=>"17"@743}},
{:control_word=>{:word=>"sbasedon"@746, :delimiter=>"16"@754}},
{:control_word=>{:word=>"snext"@757, :delimiter=>"17"@762}},
{:control_word=>{:word=>"sb"@765, :delimiter=>"0"@767}},
{:control_word=>{:word=>"sa"@769, :delimiter=>"120"@771}},
{:control_word=>{:word=>"dbch"@775, :delimiter=>nil}},
{:control_word=>{:word=>"af"@780, :delimiter=>"7"@782}},
{:text=>" List;"@783}]},
{:control_word=>{:word=>"n"@791, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"s"@794, :delimiter=>"18"@795}},
{:control_word=>{:word=>"sbasedon"@798, :delimiter=>"0"@806}},
{:control_word=>{:word=>"snext"@808, :delimiter=>"18"@813}},
{:control_word=>{:word=>"sb"@816, :delimiter=>"120"@818}},
{:control_word=>{:word=>"sa"@822, :delimiter=>"120"@824}},
{:control_word=>{:word=>"noline"@828, :delimiter=>nil}},
{:control_word=>{:word=>"i"@835, :delimiter=>nil}},
{:control_word=>{:word=>"dbch"@837, :delimiter=>nil}},
{:control_word=>{:word=>"af"@842, :delimiter=>"7"@844}},
{:control_word=>{:word=>"afs"@846, :delimiter=>"24"@849}},
{:control_word=>{:word=>"ai"@852, :delimiter=>nil}},
{:control_word=>{:word=>"fs"@855, :delimiter=>"24"@857}},
{:text=>" Caption;"@859}]},
{:control_word=>{:word=>"n"@870, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"s"@873, :delimiter=>"19"@874}},
{:control_word=>{:word=>"sbasedon"@877, :delimiter=>"0"@885}},
{:control_word=>{:word=>"snext"@887, :delimiter=>"19"@892}},
{:control_word=>{:word=>"noline"@895, :delimiter=>nil}},
{:control_word=>{:word=>"dbch"@902, :delimiter=>nil}},
{:control_word=>{:word=>"af"@907, :delimiter=>"7"@909}},
{:text=>" Index;"@910}]},
{:control_word=>{:word=>"n"@919, :delimiter=>nil}}]},
{:group=>
[{:control_word=>{:word=>"info"@923, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"creatim"@929, :delimiter=>nil}},
{:control_word=>{:word=>"yr"@937, :delimiter=>"2018"@939}},
{:control_word=>{:word=>"mo"@944, :delimiter=>"7"@946}},
{:control_word=>{:word=>"dy"@948, :delimiter=>"15"@950}},
{:control_word=>{:word=>"hr"@953, :delimiter=>"11"@955}},
{:control_word=>{:word=>"min"@958, :delimiter=>"52"@961}}]},
{:group=>
[{:control_word=>{:word=>"revtim"@966, :delimiter=>nil}},
{:control_word=>{:word=>"yr"@973, :delimiter=>"0"@975}},
{:control_word=>{:word=>"mo"@977, :delimiter=>"0"@979}},
{:control_word=>{:word=>"dy"@981, :delimiter=>"0"@983}},
{:control_word=>{:word=>"hr"@985, :delimiter=>"0"@987}},
{:control_word=>{:word=>"min"@989, :delimiter=>"0"@992}}]},
{:group=>
[{:control_word=>{:word=>"printim"@996, :delimiter=>nil}},
{:control_word=>{:word=>"yr"@1004, :delimiter=>"0"@1006}},
{:control_word=>{:word=>"mo"@1008, :delimiter=>"0"@1010}},
{:control_word=>{:word=>"dy"@1012, :delimiter=>"0"@1014}},
{:control_word=>{:word=>"hr"@1016, :delimiter=>"0"@1018}},
{:control_word=>{:word=>"min"@1020, :delimiter=>"0"@1023}}]},
{:group=>
[{:control_word=>{:word=>"comment"@1027, :delimiter=>" "@1034}},
{:text=>"OpenOffice"@1035}]},
{:group=>
[{:control_word=>{:word=>"vern"@1048, :delimiter=>"4140"@1052}}]}]},
{:control_word=>{:word=>"deftab"@1059, :delimiter=>"709"@1065}},
{:control_word=>{:word=>"n"@1069, :delimiter=>nil}},
{:control_word=>{:word=>"n"@1071, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>[], :delimiter=>nil}},
{:text=>"*"@1074},
{:control_word=>{:word=>"pgdsctbl"@1076, :delimiter=>nil}},
{:control_word=>{:word=>"n"@1085, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>"pgdsc"@1088, :delimiter=>"0"@1093}},
{:control_word=>{:word=>"pgdscuse"@1095, :delimiter=>"195"@1103}},
{:control_word=>{:word=>"pgwsxn"@1107, :delimiter=>"12240"@1113}},
{:control_word=>{:word=>"pghsxn"@1119, :delimiter=>"15840"@1125}},
{:control_word=>{:word=>"marglsxn"@1131, :delimiter=>"1134"@1139}},
{:control_word=>{:word=>"margrsxn"@1144, :delimiter=>"1134"@1152}},
{:control_word=>{:word=>"margtsxn"@1157, :delimiter=>"1134"@1165}},
{:control_word=>{:word=>"margbsxn"@1170, :delimiter=>"1134"@1178}},
{:control_word=>{:word=>"pgdscnxt"@1183, :delimiter=>"0"@1191}},
{:text=>" Default;"@1192}]}]},
{:control_word=>{:word=>"n"@1204, :delimiter=>nil}},
{:control_word=>{:word=>"formshade"@1206, :delimiter=>nil}},
{:control_word=>{:word=>"paperh"@1216, :delimiter=>"15840"@1222}},
{:control_word=>{:word=>"paperw"@1228, :delimiter=>"12240"@1234}},
{:control_word=>{:word=>"margl"@1240, :delimiter=>"1134"@1245}},
{:control_word=>{:word=>"margr"@1250, :delimiter=>"1134"@1255}},
{:control_word=>{:word=>"margt"@1260, :delimiter=>"1134"@1265}},
{:control_word=>{:word=>"margb"@1270, :delimiter=>"1134"@1275}},
{:control_word=>{:word=>"sectd"@1280, :delimiter=>nil}},
{:control_word=>{:word=>"sbknone"@1286, :delimiter=>nil}},
{:control_word=>{:word=>"sectunlocked"@1294, :delimiter=>"1"@1306}},
{:control_word=>{:word=>"pgndec"@1308, :delimiter=>nil}},
{:control_word=>{:word=>"pgwsxn"@1315, :delimiter=>"12240"@1321}},
{:control_word=>{:word=>"pghsxn"@1327, :delimiter=>"15840"@1333}},
{:control_word=>{:word=>"marglsxn"@1339, :delimiter=>"1134"@1347}},
{:control_word=>{:word=>"margrsxn"@1352, :delimiter=>"1134"@1360}},
{:control_word=>{:word=>"margtsxn"@1365, :delimiter=>"1134"@1373}},
{:control_word=>{:word=>"margbsxn"@1378, :delimiter=>"1134"@1386}},
{:control_word=>{:word=>"ftnbj"@1391, :delimiter=>nil}},
{:control_word=>{:word=>"ftnstart"@1397, :delimiter=>"1"@1405}},
{:control_word=>{:word=>"ftnrstcont"@1407, :delimiter=>nil}},
{:control_word=>{:word=>"ftnnar"@1418, :delimiter=>nil}},
{:control_word=>{:word=>"aenddoc"@1425, :delimiter=>nil}},
{:control_word=>{:word=>"aftnrstcont"@1433, :delimiter=>nil}},
{:control_word=>{:word=>"aftnstart"@1445, :delimiter=>"1"@1454}},
{:control_word=>{:word=>"aftnnrlc"@1456, :delimiter=>nil}},
{:control_word=>{:word=>"n"@1465, :delimiter=>nil}},
{:control_word=>{:word=>"pgndec"@1467, :delimiter=>nil}},
{:control_word=>{:word=>"pard"@1474, :delimiter=>nil}},
{:control_word=>{:word=>"plain"@1479, :delimiter=>" "@1484}},
{:control_word=>{:word=>"s"@1486, :delimiter=>"0"@1487}},
{:control_word=>{:word=>"nowidctlpar"@1489, :delimiter=>nil}},
{:group=>
[{:control_word=>{:word=>[], :delimiter=>nil}},
{:text=>"*"@1502},
{:control_word=>{:word=>"hyphen"@1504, :delimiter=>"2"@1510}},
{:control_word=>{:word=>"hyphlead"@1512, :delimiter=>"2"@1520}},
{:control_word=>{:word=>"hyphtrail"@1522, :delimiter=>"2"@1531}},
{:control_word=>{:word=>"hyphmax"@1533, :delimiter=>"0"@1540}}]},
{:control_word=>{:word=>"cf"@1543, :delimiter=>"0"@1545}},
{:control_word=>{:word=>"kerning"@1547, :delimiter=>"1"@1554}},
{:control_word=>{:word=>"hich"@1556, :delimiter=>nil}},
{:control_word=>{:word=>"af"@1561, :delimiter=>"5"@1563}},
{:control_word=>{:word=>"langfe"@1565, :delimiter=>"2052"@1571}},
{:control_word=>{:word=>"dbch"@1576, :delimiter=>nil}},
{:control_word=>{:word=>"af"@1581, :delimiter=>"6"@1583}},
{:control_word=>{:word=>"afs"@1585, :delimiter=>"24"@1588}},
{:control_word=>{:word=>"lang"@1591, :delimiter=>"1081"@1595}},
{:control_word=>{:word=>"loch"@1600, :delimiter=>nil}},
{:control_word=>{:word=>"f"@1605, :delimiter=>"3"@1606}},
{:control_word=>{:word=>"fs"@1608, :delimiter=>"24"@1610}},
{:control_word=>{:word=>"lang"@1613, :delimiter=>"1033"@1617}},
{:group=>
[{:control_word=>{:word=>"rtlch"@1623, :delimiter=>" "@1628}},
{:control_word=>{:word=>"ltrch"@1630, :delimiter=>nil}},
{:control_word=>{:word=>"loch"@1636, :delimiter=>nil}},
{:control_word=>{:word=>"n"@1641, :delimiter=>nil}},
{:text=>"I like to read."@1642}]},
{:control_word=>{:word=>"n"@1659, :delimiter=>nil}},
{:control_word=>{:word=>"par"@1661, :delimiter=>" "@1664}}]}