| 1 | #! /usr/bin/env python | 
|---|
| 2 | """ | 
|---|
| 3 |   BibTeX to Doxygen converter | 
|---|
| 4 |   Usage: python bib2dox.py bibfile.bib > bibfile.dox | 
|---|
| 5 |  | 
|---|
| 6 |   This file is a part of LEMON, a generic C++ optimization library. | 
|---|
| 7 |  | 
|---|
| 8 |   ********************************************************************** | 
|---|
| 9 |  | 
|---|
| 10 |   This code is the modification of the BibTeX to XML converter | 
|---|
| 11 |   by Vidar Bronken Gundersen et al. | 
|---|
| 12 |   See the original copyright notices below.  | 
|---|
| 13 |  | 
|---|
| 14 |   ********************************************************************** | 
|---|
| 15 |  | 
|---|
| 16 |   Decoder for bibliographic data, BibTeX | 
|---|
| 17 |   Usage: python bibtex2xml.py bibfile.bib > bibfile.xml | 
|---|
| 18 |  | 
|---|
| 19 |   v.8 | 
|---|
| 20 |   (c)2002-06-23 Vidar Bronken Gundersen | 
|---|
| 21 |   http://bibtexml.sf.net/ | 
|---|
| 22 |   Reuse approved as long as this notification is kept. | 
|---|
| 23 |   Licence: GPL. | 
|---|
| 24 |  | 
|---|
| 25 |   Contributions/thanks to: | 
|---|
| 26 |   Egon Willighagen, http://sf.net/projects/jreferences/ | 
|---|
| 27 |   Richard Mahoney (for providing a test case) | 
|---|
| 28 |  | 
|---|
| 29 |   Edited by Sara Sprenkle to be more robust and handle more bibtex features. | 
|---|
| 30 |   (c) 2003-01-15 | 
|---|
| 31 |  | 
|---|
| 32 |   1.  Changed bibtex: tags to bibxml: tags. | 
|---|
| 33 |   2.  Use xmlns:bibxml="http://bibtexml.sf.net/" | 
|---|
| 34 |   3.  Allow spaces between @type and first { | 
|---|
| 35 |   4.  "author" fields with multiple authors split by " and " | 
|---|
| 36 |       are put in separate xml "bibxml:author" tags. | 
|---|
| 37 |   5.  Option for Titles: words are capitalized | 
|---|
| 38 |       only if first letter in title or capitalized inside braces | 
|---|
| 39 |   6.  Removes braces from within field values | 
|---|
| 40 |   7.  Ignores comments in bibtex file (including @comment{ or % ) | 
|---|
| 41 |   8.  Replaces some special latex tags, e.g., replaces ~ with ' ' | 
|---|
| 42 |   9.  Handles bibtex @string abbreviations | 
|---|
| 43 |         --> includes bibtex's default abbreviations for months | 
|---|
| 44 |         --> does concatenation of abbr # " more " and " more " # abbr | 
|---|
| 45 |   10. Handles @type( ... ) or @type{ ... } | 
|---|
| 46 |   11. The keywords field is split on , or ; and put into separate xml | 
|---|
| 47 |       "bibxml:keywords" tags | 
|---|
| 48 |   12. Ignores @preamble | 
|---|
| 49 |  | 
|---|
| 50 |   Known Limitations | 
|---|
| 51 |   1.  Does not transform Latex encoding like math mode and special | 
|---|
| 52 |       latex symbols. | 
|---|
| 53 |   2.  Does not parse author fields into first and last names. | 
|---|
| 54 |       E.g., It does not do anything special to an author whose name is | 
|---|
| 55 |       in the form LAST_NAME, FIRST_NAME | 
|---|
| 56 |       In "author" tag, will show up as | 
|---|
| 57 |       <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author> | 
|---|
| 58 |   3.  Does not handle "crossref" fields other than to print | 
|---|
| 59 |       <bibxml:crossref>...</bibxml:crossref> | 
|---|
| 60 |   4.  Does not inform user of the input's format errors.  You just won't | 
|---|
| 61 |       be able to transform the file later with XSL | 
|---|
| 62 |  | 
|---|
| 63 |   You will have to manually edit the XML output if you need to handle | 
|---|
| 64 |   these (and unknown) limitations. | 
|---|
| 65 |  | 
|---|
| 66 | """ | 
|---|
| 67 |  | 
|---|
| 68 | import string, re | 
|---|
| 69 |  | 
|---|
# Set of characters that may appear in a @string abbreviation name.
valid_name_chars = r'[\w\-:]'

#
# Global regular expressions, compiled once at import time.
# Raw strings are used throughout: in Python 3 the old url_rex literal
# ('\\\url{...') is a SyntaxError because '\u' starts a unicode escape,
# and the other plain literals raise invalid-escape warnings.
#

# "author"/"editor" fields list several names joined by the word "and".
author_rex = re.compile(r'\s+and\s+')
# Strips '{' and '}' from field values.
rembraces_rex = re.compile(r'[{}]')
# Captures {...} groups whose capitalization must be preserved in titles.
capitalize_rex = re.compile(r'({[^}]*})')

# used by bibtexkeywords(data): keyword lists split on ',' or ';'
keywords_rex = re.compile(r'[,;]')

# used by concat_line(line): the '#' concatenation operator (with spaces)
concatsplit_rex = re.compile(r'\s*#\s*')

# split on {, }, or " in verify_out_of_braces
delimiter_rex = re.compile(r'([{}"])', re.I)

# "field = value" lines; data_rex stops the value at the first comma.
field_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
data_rex = re.compile(r'\s*(\w*)\s*=\s*([^,]*),?')

# \url{...} markup generated for the "url" field.
url_rex = re.compile(r'\\url\{([^}]*)\}')

#
# styles for html formatting
#
divstyle = 'margin-top: -4ex; margin-left: 8em;'
|---|
| 98 |  | 
|---|
#
# replace \url{...} markup with an HTML anchor (the old comment here was a
# copy-paste of removebraces' comment)
#
def transformurls(text):
    """Return *text* with every \\url{...} occurrence turned into a link."""
    # The pattern is inlined as a raw string: the original module-level
    # url_rex literal used '\\\url{', whose '\u' escape is rejected by
    # Python 3.  The parameter no longer shadows the builtin ``str``;
    # all callers pass it positionally.
    return re.sub(r'\\url\{([^}]*)\}', r'<a href="\1">\1</a>', text)
|---|
| 104 |  | 
|---|
#
# return the string parameter without braces
#
def removebraces(text):
    """Return *text* with all '{' and '}' characters removed."""
    # str.replace is used instead of the module-level rembraces_rex so the
    # helper is self-contained; the parameter no longer shadows the builtin
    # ``str`` (all callers pass it positionally).
    return text.replace('{', '').replace('}', '')
|---|
| 110 |  | 
|---|
#
# latex-specific replacements
# (do this after braces were removed)
#
def latexreplacements(line):
    """Replace common LaTeX markup in *line* with plain/Unicode text.

    Handles '~' (non-breaking space) and the acute ("\\'"), umlaut ('\\"')
    and double-acute ('\\H ') accent commands for vowels.  Must run after
    removebraces(), since the patterns expect the bare backslash commands.
    """
    # (pattern, replacement) pairs, applied in the original order; the
    # patterns do not overlap, so the order is not semantically important.
    replacements = (
        ('~', ' '),
        ("\\'a", 'á'), ('\\"a', 'ä'),
        ("\\'e", 'é'), ('\\"e', 'ë'),
        ("\\'i", 'í'), ('\\"i', 'ï'),
        ("\\'o", 'ó'), ('\\"o', 'ö'),
        ("\\'u", 'ú'), ('\\"u', 'ü'),
        ('\\H o', 'õ'), ('\\H u', 'ü'),   # ũ does not exist
        ("\\'A", 'Á'), ('\\"A', 'Ä'),
        ("\\'E", 'É'), ('\\"E', 'Ë'),
        ("\\'I", 'Í'), ('\\"I', 'Ï'),
        ("\\'O", 'Ó'), ('\\"O', 'Ö'),
        ("\\'U", 'Ú'), ('\\"U', 'Ü'),
        ('\\H O', 'Õ'), ('\\H U', 'Ü'),   # Ũ does not exist
    )
    # string.replace() was removed in Python 3; use the str method instead.
    for old, new in replacements:
        line = line.replace(old, new)
    return line
|---|
| 143 |  | 
|---|
#
# copy characters form a string decoding html expressions (&xyz;)
#
def copychars(str, ifrom, count):
    """Collect up to *count* letters from str[ifrom:], decoding entities.

    An HTML entity '&xyz;' contributes only its first character ('x');
    outside of entities only ASCII letters are copied, everything else
    (digits, punctuation, spaces) is skipped.
    """
    picked = []
    taken = 0
    pos = ifrom
    inside_entity = False
    limit = len(str)
    while pos < limit and taken < count:
        ch = str[pos]
        if ch == '&':
            # start of an entity: keep the character right after the '&'
            inside_entity = True
            if pos + 1 < limit:
                picked.append(str[pos + 1])
            taken += 1
            pos += 2
            continue
        if inside_entity:
            # skip the entity body until the terminating ';'
            if ch == ';':
                inside_entity = False
        elif ('A' <= ch <= 'Z') or ('a' <= ch <= 'z'):
            picked.append(ch)
            taken += 1
        pos += 1

    return ''.join(picked)
|---|
| 170 |  | 
|---|
| 171 |  | 
|---|
# 
# Handle a list of authors (separated by 'and').
# It gives back a dict with the following keys:
#  - num: the number of authors,
#  - list: the list of the author names,
#  - text: the bibtex text (separated by commas and/or 'and')
#  - abbrev: abbreviation that can be used to indicate the
#    bibliography entries
#
def bibtexauthor(data):
    """Parse a BibTeX author/editor field (names joined by ' and ')."""
    def clean(raw):
        # strip braces/latex markup, then turn "Last, First" into "First Last"
        name = latexreplacements(removebraces(raw.strip()))
        comma = name.find(',')
        if comma != -1:
            name = name[comma + 1:].strip() + ' ' + name[:comma].strip()
        return name

    names = [clean(raw) for raw in author_rex.split(data)]
    num = len(names)

    # Join with a '#' placeholder first; the last separator becomes
    # '(,) and ', the remaining ones become ', '.
    text = '#'.join(names)
    if num > 1:
        last_sep = text.rfind('#')
        joiner = ' and ' if num == 2 else ', and '
        text = text[:last_sep] + joiner + text[last_sep + 1:]
    text = text.replace('#', ', ')

    # Abbreviation: the first letter of each last name; a single author
    # contributes up to three letters.
    per_author = 3 if num == 1 else 1
    abbrev = ''
    for name in names:
        abbrev += copychars(name, name.rfind(' ') + 1, per_author)

    return {'num': num, 'list': names, 'text': text, 'abbrev': abbrev}
|---|
| 214 |  | 
|---|
| 215 |  | 
|---|
#
# data = title string
# @return the capitalized title: the first phrase is capitalized and the
# rest lowercased, except {...} groups, whose capitalization is preserved
#
def capitalizetitle(data):
    """Capitalize a BibTeX title, honoring {...} protected groups."""
    # Split so that each {...} group becomes its own list element.
    # (Pattern and brace removal are inlined so this is self-contained;
    # string.lstrip()/the string module functions were removed in Python 3.)
    title_list = re.split(r'({[^}]*})', data)
    title = ''
    for idx, phrase in enumerate(title_list):
        check = phrase.lstrip()
        if check.startswith('{'):
            # protected group: keep its capitalization, drop the braces
            title += phrase.replace('{', '').replace('}', '')
        elif idx == 0:
            # first phrase: capitalize first letter, lowercase the rest
            title += check.capitalize()
        else:
            title += phrase.lower()
    return title
|---|
| 240 |  | 
|---|
| 241 |  | 
|---|
#
# @return the bibtex for the title
# @param data --> title string
# braces are removed from title
#
def bibtextitle(data, entrytype):
    """Format the title field; book titles keep their capitalization."""
    stripped = data.strip()
    if entrytype in ('book', 'inbook'):
        # book titles are shown verbatim (minus braces)
        return removebraces(stripped)
    # everything else gets the capitalize-first-word treatment
    return removebraces(capitalizetitle(stripped))
|---|
| 254 |  | 
|---|
| 255 |  | 
|---|
#
# three-way comparison of entry lists by their sort key (entry[0])
#
def entry_cmp(x, y):
    """Return -1/0/1 comparing x[0] with y[0].

    The builtin cmp() was removed in Python 3; the subtraction of the two
    boolean comparisons is the standard equivalent.
    """
    return (x[0] > y[0]) - (x[0] < y[0])
|---|
| 261 |  | 
|---|
| 262 |  | 
|---|
#
# generate the Doxygen code for the transformed "filecont_source"
#
def bibtexdecoder(filecont_source):
    """Turn preprocessed BibTeX lines into Doxygen output lines.

    *filecont_source* is an iterable of newline-terminated lines that have
    already been through abbreviation substitution.  Returns a list of
    output lines: one ``\\section`` block per entry, sorted by the
    author/year/title sort key.

    Python 3 fixes applied: dict.has_key -> ``in``, string module
    functions -> str methods, list.sort(cmp) -> key-based sort.
    """
    filecont = []
    output = []            # renamed from 'file', which shadowed a builtin

    # want @<alphanumeric chars><spaces>{<spaces><any chars>,
    pubtype_rex = re.compile(r'@(\w*)\s*{\s*(.*),')
    endtype_rex = re.compile(r'}\s*$')

    bracefield_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
    bracedata_rex = re.compile(r'\s*(\w*)\s*=\s*{(.*)},?')

    quotefield_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
    quotedata_rex = re.compile(r'\s*(\w*)\s*=\s*"(.*)",?')

    for line in filecont_source:
        line = line[:-1]                   # drop the trailing newline

        # encode character entities for the HTML output
        # (these replacements had decayed into no-ops: '&' -> '&' etc.)
        line = line.replace('&', '&amp;')
        line = line.replace('<', '&lt;')
        line = line.replace('>', '&gt;')

        # start entry: publication type (store for later use)
        if pubtype_rex.match(line):
            entrycont = {}
            entry = []
            entrytype = pubtype_rex.sub(r'\g<1>', line).lower()
            entryid = pubtype_rex.sub(r'\g<2>', line)

        # end entry if just a }
        elif endtype_rex.match(line):
            # generate doxygen code for the entry

            # entry type related formattings
            if entrytype in ('book', 'inbook'):
                entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
                if 'author' not in entrycont:
                    # books without authors are listed under their editors
                    entrycont['author'] = entrycont['editor']
                    entrycont['author']['text'] += ', editors'
            elif entrytype == 'article':
                entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
            elif entrytype in ('inproceedings', 'incollection', 'conference'):
                entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
            elif entrytype == 'techreport':
                if 'type' not in entrycont:
                    entrycont['type'] = 'Technical report'
            elif entrytype == 'mastersthesis':
                entrycont['type'] = 'Master\'s thesis'
            elif entrytype == 'phdthesis':
                entrycont['type'] = 'PhD thesis'

            # (a no-op loop that rebound dict keys to latexreplacements()
            # results was removed here — it had no effect on entrycont)

            if 'pages' in entrycont and entrycont['pages'] != '':
                entrycont['pages'] = entrycont['pages'].replace('--', '-')

            # assemble the entry text in citation order
            if 'author' in entrycont and entrycont['author'] != '':
                entry.append(entrycont['author']['text'] + '.')
            if 'title' in entrycont and entrycont['title'] != '':
                entry.append(entrycont['title'] + '.')
            if 'journal' in entrycont and entrycont['journal'] != '':
                entry.append(entrycont['journal'] + ',')
            if 'booktitle' in entrycont and entrycont['booktitle'] != '':
                entry.append('In ' + entrycont['booktitle'] + ',')
            if 'type' in entrycont and entrycont['type'] != '':
                eline = entrycont['type']
                if 'number' in entrycont and entrycont['number'] != '':
                    eline += ' ' + entrycont['number']
                eline += ','
                entry.append(eline)
            if 'institution' in entrycont and entrycont['institution'] != '':
                entry.append(entrycont['institution'] + ',')
            if 'publisher' in entrycont and entrycont['publisher'] != '':
                entry.append(entrycont['publisher'] + ',')
            if 'school' in entrycont and entrycont['school'] != '':
                entry.append(entrycont['school'] + ',')
            if 'address' in entrycont and entrycont['address'] != '':
                entry.append(entrycont['address'] + ',')
            if 'edition' in entrycont and entrycont['edition'] != '':
                entry.append(entrycont['edition'] + ' edition,')
            if 'howpublished' in entrycont and entrycont['howpublished'] != '':
                entry.append(entrycont['howpublished'] + ',')
            if 'volume' in entrycont and entrycont['volume'] != '':
                # volume(number):pages,
                eline = entrycont['volume']
                if 'number' in entrycont and entrycont['number'] != '':
                    eline += '(' + entrycont['number'] + ')'
                if 'pages' in entrycont and entrycont['pages'] != '':
                    eline += ':' + entrycont['pages']
                eline += ','
                entry.append(eline)
            else:
                if 'pages' in entrycont and entrycont['pages'] != '':
                    entry.append('pages ' + entrycont['pages'] + ',')
            if 'year' in entrycont and entrycont['year'] != '':
                if 'month' in entrycont and entrycont['month'] != '':
                    entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
                else:
                    entry.append(entrycont['year'] + '.')
            if 'note' in entrycont and entrycont['note'] != '':
                entry.append(entrycont['note'] + '.')
            if 'url' in entrycont and entrycont['url'] != '':
                entry.append(entrycont['url'] + '.')

            # generate keys for sorting and for the output
            sortkey = ''
            bibkey = ''
            if 'author' in entrycont:
                for author in entrycont['author']['list']:
                    sortkey += copychars(author, author.rfind(' ') + 1, len(author))
                bibkey = entrycont['author']['abbrev']
            else:
                bibkey = 'x'
            if 'year' in entrycont:
                sortkey += entrycont['year']
                bibkey += entrycont['year'][-2:]
            if 'title' in entrycont:
                sortkey += entrycont['title']
            if 'key' in entrycont:
                sortkey = entrycont['key'] + sortkey
                bibkey = entrycont['key']
            entry.insert(0, sortkey)
            entry.insert(1, bibkey)
            entry.insert(2, entryid)

            # add the entry to the file contents
            filecont.append(entry)

        else:
            # field, publication info
            field = ''
            data = ''

            # field = {data} entries
            if bracedata_rex.match(line):
                field = bracefield_rex.sub(r'\g<1>', line).lower()
                data = bracedata_rex.sub(r'\g<2>', line)

            # field = "data" entries
            elif quotedata_rex.match(line):
                field = quotefield_rex.sub(r'\g<1>', line).lower()
                data = quotedata_rex.sub(r'\g<2>', line)

            # field = data entries
            elif data_rex.match(line):
                field = field_rex.sub(r'\g<1>', line).lower()
                data = data_rex.sub(r'\g<2>', line)

            if field == 'url':
                data = '\\url{' + data.strip() + '}'

            if field in ('author', 'editor'):
                entrycont[field] = bibtexauthor(data)
                line = ''
            elif field == 'title':
                line = bibtextitle(data, entrytype)
            elif field != '':
                line = removebraces(transformurls(data.strip()))

            if line != '':
                line = latexreplacements(line)
                entrycont[field] = line

    # sort entries by their sort key; list.sort(cmp) is gone in Python 3,
    # and a key-based sort on entry[0] matches the old entry_cmp ordering
    filecont.sort(key=lambda entry: entry[0])

    # count how many entries share each bibtex key
    keytable = {}
    counttable = {}
    for entry in filecont:
        bibkey = entry[1]
        if bibkey not in keytable:
            keytable[bibkey] = 1
        else:
            keytable[bibkey] += 1

    for bibkey in keytable.keys():
        counttable[bibkey] = 0

    # generate output
    for entry in filecont:
        # generate the output key from the bibtex key; duplicated keys get
        # an 'a', 'b', ... suffix
        bibkey = entry[1]
        entryid = entry[2]
        if keytable[bibkey] == 1:
            outkey = bibkey
        else:
            outkey = bibkey + chr(97 + counttable[bibkey])
        counttable[bibkey] += 1

        # append the entry code to the output
        output.append('\\section ' + entryid + ' [' + outkey + ']')
        output.append('<div style="' + divstyle + '">')
        for line in entry[3:]:
            output.append(line)
        output.append('</div>')
        output.append('')

    return output
|---|
| 473 |  | 
|---|
| 474 |  | 
|---|
#
# return 1 iff abbr is in line but not inside braces or quotes
# assumes that abbr appears only once on the line (out of braces and quotes)
#
def verify_out_of_braces(line, abbr):
    """Return 1 if *abbr* occurs on *line* outside {...} and "...", else 0."""
    word_rex = re.compile('\\b' + abbr + '\\b', re.I)

    depth = 0          # current brace nesting level
    in_quote = False   # inside a "..." value?

    for token in delimiter_rex.split(line):
        if token == '{':
            depth += 1
        elif token == '}':
            depth -= 1
        elif token == '"':
            in_quote = not in_quote
        elif word_rex.search(token) and depth == 0 and not in_quote:
            return 1

    return 0
|---|
| 503 |  | 
|---|
| 504 |  | 
|---|
#
# a line in the form phrase1 # phrase2 # ... # phrasen
# is returned as phrase1 phrase2 ... phrasen
# with the correct punctuation
# Bug: Doesn't always work with multiple abbreviations plugged in
#
def concat_line(line):
    """Fold BibTeX '#' concatenations on *line* into one braced value."""
    # only look at the part after the equals sign
    field = field_rex.sub(r'\g<1>', line)
    rest = field_rex.sub(r'\g<2>', line)

    pieces = concatsplit_rex.split(rest)
    last = len(pieces) - 1
    out = field + ' ='

    for idx, phrase in enumerate(pieces):
        phrase = phrase.strip()

        # opening delimiter: drop it on inner pieces, turn a leading
        # '"' into '{' on the first piece
        if idx != 0:
            if phrase.startswith('"') or phrase.startswith('{'):
                phrase = phrase[1:]
        elif phrase.startswith('"'):
            phrase = phrase.replace('"', '{', 1)

        # closing delimiter: drop it on inner pieces, turn a trailing
        # '"' (or '",') into '}' (or '},') on the last piece
        if idx != last:
            if phrase.endswith('"') or phrase.endswith('}'):
                phrase = phrase[:-1]
        else:
            if phrase.endswith('"'):
                phrase = phrase[:-1] + '}'
            elif phrase.endswith('",'):
                phrase = phrase[:-2] + '},'

        # if phrase did have \#, add the # back
        if phrase.endswith('\\'):
            phrase += '#'
        out = out + ' ' + phrase

    return out
|---|
| 550 |  | 
|---|
| 551 |  | 
|---|
#
# substitute abbreviations into filecont
# @param filecont_source - string of data from file
#
def bibtex_replace_abbreviations(filecont_source):
    """Expand @string abbreviations (plus the bibtex month defaults).

    Returns the file contents as a single string with abbreviations
    replaced by their values, '#' concatenations folded, and the bodies of
    @string / @comment / @preamble blocks dropped.

    Python 3 fix: string.strip() no longer exists; str.strip() is used.
    """
    filecont = filecont_source.splitlines()

    # These are predefined in bibtex, so we predefine them too.
    abbr_list = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    value_list = ['January', 'February', 'March', 'April',
                  'May', 'June', 'July', 'August', 'September',
                  'October', 'November', 'December']

    # one compiled whole-word regex per abbreviation; the (,?) group keeps
    # a comma that directly follows the abbreviation
    front = r'\b'
    back = r'(,?)\b'
    abbr_rex = [re.compile(front + abbr + back, re.I) for abbr in abbr_list]
    total_abbr_count = len(abbr_list)

    abbrdef_rex = re.compile(r'\s*@string\s*{\s*(' + valid_name_chars + r'*)\s*=(.*)',
                             re.I)

    comment_rex = re.compile(r'@comment\s*{', re.I)
    preamble_rex = re.compile(r'@preamble\s*{', re.I)

    waiting_for_end_string = 0
    filecont2 = ''

    for line in filecont:
        if line == ' ' or line == '':
            continue

        # inside a @string/@comment/@preamble block: skip until the '}'
        if waiting_for_end_string:
            if re.search('}', line):
                waiting_for_end_string = 0
                continue

        # an @string definition: register the abbreviation for later lines
        if abbrdef_rex.search(line):
            abbr = abbrdef_rex.sub(r'\g<1>', line)

            if abbr_list.count(abbr) == 0:
                val = abbrdef_rex.sub(r'\g<2>', line)
                abbr_list.append(abbr)
                value_list.append(val.strip())
                abbr_rex.append(re.compile(front + abbr + back, re.I))
                total_abbr_count += 1
            waiting_for_end_string = 1
            continue

        if comment_rex.search(line):
            waiting_for_end_string = 1
            continue

        if preamble_rex.search(line):
            waiting_for_end_string = 1
            continue

        # replace any known abbreviation used on this line with its value
        for abbr_count in range(total_abbr_count):
            if abbr_rex[abbr_count].search(line):
                if verify_out_of_braces(line, abbr_list[abbr_count]) == 1:
                    line = abbr_rex[abbr_count].sub(
                        value_list[abbr_count] + r'\g<1>', line)
                # Check for # concatenations
                if concatsplit_rex.search(line):
                    line = concat_line(line)

        filecont2 = filecont2 + line + '\n'

    # Do one final pass over the substituted text.

    # make sure we didn't end up with {" or }" after the substitution
    filecont2 = filecont2.replace('{"', '{{')
    filecont2 = filecont2.replace('"}', '}}')

    afterquotevalue_rex = re.compile(r'"\s*,\s*')
    afterbrace_rex = re.compile(r'"\s*}')
    afterbracevalue_rex = re.compile(r'(=\s*{[^=]*)},\s*')

    # add new lines to data that changed because of abbreviation substitutions
    filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
    filecont2 = afterbrace_rex.sub('"\n}', filecont2)
    filecont2 = afterbracevalue_rex.sub(r'\g<1>},\n', filecont2)

    return filecont2
|---|
| 651 |  | 
|---|
| 652 | # | 
|---|
| 653 | # convert @type( ... ) to @type{ ... } | 
|---|
| 654 | # | 
|---|
def no_outer_parens(filecont):
    """Convert the outer parentheses of each @type( ... ) entry to braces.

    Only the pair of parens that delimits an entry body becomes { ... };
    parens nested inside field values are left untouched.  Returns the
    rewritten file contents as one string.
    """
    # split so that every paren/brace becomes its own list element
    pieces = re.split('([(){}])', filecont)

    entry_rex = re.compile(r'@\w*')

    depth = 0        # current open-paren nesting level
    in_entry = 0     # 1 while inside an @type( ... ) body
    expect_open = 0  # 1 immediately after an @type token

    rebuilt = ''
    for piece in pieces:
        if expect_open == 1:
            # the token right after @type: a '(' here is the entry opener
            if piece == '(':
                piece = '{'
                depth = depth + 1
            else:
                in_entry = 0
            expect_open = 0

        if piece == '(':
            depth = depth + 1
        elif piece == ')':
            depth = depth - 1
            # closing the outermost paren of an @type( ... ) entry
            if in_entry == 1 and depth == 0:
                piece = '}'
                in_entry = 0
        elif entry_rex.search(piece):
            in_entry = 1
            expect_open = 1

        rebuilt = rebuilt + piece

    return rebuilt
|---|
| 695 |  | 
|---|
| 696 |  | 
|---|
| 697 | # | 
|---|
| 698 | # make all whitespace into just one space | 
|---|
| 699 | # format the bibtex file into a usable form. | 
|---|
| 700 | # | 
|---|
def bibtexwasher(filecont_source):
    """Normalize raw BibTeX input into a list of clean, one-field lines.

    Collapses whitespace runs to single spaces, drops '%' comment lines
    and blank lines, converts @type( ... ) entries to @type{ ... },
    re-splits the data so each field sits on its own line, substitutes
    @string abbreviations, and returns the result as a list of lines,
    each terminated by a newline.

    filecont_source -- iterable of raw input lines (e.g. file.readlines())
    """

    space_rex = re.compile('\s+')
    comment_rex = re.compile('\s*%')

    filecont = []

    # remove trailing and excessive whitespace
    # ignore comments
    for line in filecont_source:
        # str methods instead of the deprecated string-module functions
        # (string.strip/string.join no longer exist in Python 3, and the
        # methods behave identically here under Python 2)
        line = line.strip()
        line = space_rex.sub(' ', line)
        # ignore comments
        if not comment_rex.match(line) and line != '':
            filecont.append(' '+ line)

    filecont = ''.join(filecont)

    # the file is in one long string

    filecont = no_outer_parens(filecont)

    #
    # split lines according to preferred syntax scheme
    #
    filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)

    # add new lines after commas that are after values
    filecont = re.sub('"\s*,', '",\n', filecont)
    filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
    filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',
                          '\n\n\g<1>\g<2>,\n', filecont)

    # add new lines after }
    filecont = re.sub('"\s*}','"\n}\n', filecont)
    filecont = re.sub('}\s*,','},\n', filecont)


    filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)

    # character encoding, reserved latex characters
    filecont = re.sub('{\\\&}', '&', filecont)
    filecont = re.sub('\\\&', '&', filecont)

    # do checking for open braces to get format correct
    open_brace_count = 0
    brace_split = re.split('([{}])',filecont)

    # rebuild filecont, inserting a newline before each entry-closing '}'
    filecont = ''

    for phrase in brace_split:
        if phrase == '{':
            open_brace_count = open_brace_count + 1
        elif phrase == '}':
            open_brace_count = open_brace_count - 1
            if open_brace_count == 0:
                filecont = filecont + '\n'

        filecont = filecont + phrase

    filecont2 = bibtex_replace_abbreviations(filecont)

    # gather: compact non-blank lines to the front of the list, in place.
    # Writing at index i (<= current read position) is safe while iterating.
    filecont = filecont2.splitlines()
    i=0
    j=0         # count the number of blank lines
    for line in filecont:
        # ignore blank lines
        if line == '' or line == ' ':
            j = j+1
            continue
        filecont[i] = line + '\n'
        i = i+1

    # get rid of the extra stuff at the end of the array
    # (The extra stuff are duplicates that are in the array because
    # blank lines were removed.)
    length = len( filecont)
    filecont[length-j:length] = []

    return filecont
|---|
| 783 |  | 
|---|
| 784 |  | 
|---|
| 785 | def filehandler(filepath): | 
|---|
| 786 |     try: | 
|---|
| 787 |         fd = open(filepath, 'r') | 
|---|
| 788 |         filecont_source = fd.readlines() | 
|---|
| 789 |         fd.close() | 
|---|
| 790 |     except: | 
|---|
| 791 |         print 'Could not open file:', filepath | 
|---|
| 792 |     washeddata = bibtexwasher(filecont_source) | 
|---|
| 793 |     outdata = bibtexdecoder(washeddata) | 
|---|
| 794 |     print '/**' | 
|---|
| 795 |     print '\page references References' | 
|---|
| 796 |     print | 
|---|
| 797 |     for line in outdata: | 
|---|
| 798 |         print line | 
|---|
| 799 |     print '*/' | 
|---|
| 800 |  | 
|---|
| 801 |  | 
|---|
| 802 | # main program | 
|---|
| 803 |  | 
|---|
| 804 | def main(): | 
|---|
| 805 |     import sys | 
|---|
| 806 |     if sys.argv[1:]: | 
|---|
| 807 |         filepath = sys.argv[1] | 
|---|
| 808 |     else: | 
|---|
| 809 |         print "No input file" | 
|---|
| 810 |         sys.exit() | 
|---|
| 811 |     filehandler(filepath) | 
|---|
| 812 |  | 
|---|
if __name__ == "__main__":
    main()
|---|
| 814 |  | 
|---|
| 815 |  | 
|---|
| 816 | # end python script | 
|---|