1 | #! /usr/bin/env python |
---|
2 | """ |
---|
3 | BibTeX to Doxygen converter |
---|
4 | Usage: python bib2dox.py bibfile.bib > bibfile.dox |
---|
5 | |
---|
6 | This file is a part of LEMON, a generic C++ optimization library. |
---|
7 | |
---|
8 | ********************************************************************** |
---|
9 | |
---|
10 | This code is the modification of the BibTeX to XML converter |
---|
11 | by Vidar Bronken Gundersen et al. |
---|
12 | See the original copyright notices below. |
---|
13 | |
---|
14 | ********************************************************************** |
---|
15 | |
---|
16 | Decoder for bibliographic data, BibTeX |
---|
17 | Usage: python bibtex2xml.py bibfile.bib > bibfile.xml |
---|
18 | |
---|
19 | v.8 |
---|
20 | (c)2002-06-23 Vidar Bronken Gundersen |
---|
21 | http://bibtexml.sf.net/ |
---|
22 | Reuse approved as long as this notification is kept. |
---|
23 | Licence: GPL. |
---|
24 | |
---|
25 | Contributions/thanks to: |
---|
26 | Egon Willighagen, http://sf.net/projects/jreferences/ |
---|
27 | Richard Mahoney (for providing a test case) |
---|
28 | |
---|
29 | Edited by Sara Sprenkle to be more robust and handle more bibtex features. |
---|
30 | (c) 2003-01-15 |
---|
31 | |
---|
32 | 1. Changed bibtex: tags to bibxml: tags. |
---|
33 | 2. Use xmlns:bibxml="http://bibtexml.sf.net/" |
---|
34 | 3. Allow spaces between @type and first { |
---|
35 | 4. "author" fields with multiple authors split by " and " |
---|
36 | are put in separate xml "bibxml:author" tags. |
---|
37 | 5. Option for Titles: words are capitalized |
---|
38 | only if first letter in title or capitalized inside braces |
---|
39 | 6. Removes braces from within field values |
---|
40 | 7. Ignores comments in bibtex file (including @comment{ or % ) |
---|
41 | 8. Replaces some special latex tags, e.g., replaces ~ with ' ' |
---|
42 | 9. Handles bibtex @string abbreviations |
---|
43 | --> includes bibtex's default abbreviations for months |
---|
44 | --> does concatenation of abbr # " more " and " more " # abbr |
---|
45 | 10. Handles @type( ... ) or @type{ ... } |
---|
46 | 11. The keywords field is split on , or ; and put into separate xml |
---|
47 | "bibxml:keywords" tags |
---|
48 | 12. Ignores @preamble |
---|
49 | |
---|
50 | Known Limitations |
---|
51 | 1. Does not transform Latex encoding like math mode and special |
---|
52 | latex symbols. |
---|
53 | 2. Does not parse author fields into first and last names. |
---|
54 | E.g., It does not do anything special to an author whose name is |
---|
55 | in the form LAST_NAME, FIRST_NAME |
---|
56 | In "author" tag, will show up as |
---|
57 | <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author> |
---|
58 | 3. Does not handle "crossref" fields other than to print |
---|
59 | <bibxml:crossref>...</bibxml:crossref> |
---|
60 | 4. Does not inform user of the input's format errors. You just won't |
---|
61 | be able to transform the file later with XSL |
---|
62 | |
---|
63 | You will have to manually edit the XML output if you need to handle |
---|
64 | these (and unknown) limitations. |
---|
65 | |
---|
66 | """ |
---|
67 | |
---|
68 | import string, re |
---|
69 | |
---|
# Character class of the characters allowed in a BibTeX @string name.
valid_name_chars = r'[\w\-:]'

#
# define global regular expression variables
#
# All patterns are raw strings so that the regex escapes are not first
# interpreted as (invalid) Python string escapes.
#
author_rex = re.compile(r'\s+and\s+')
rembraces_rex = re.compile(r'[{}]')
capitalize_rex = re.compile(r'({[^}]*})')

# splits a keyword list on , or ;
keywords_rex = re.compile(r'[,;]')

# used by concat_line(line)
concatsplit_rex = re.compile(r'\s*#\s*')

# split on {, }, or " in verify_out_of_braces
delimiter_rex = re.compile(r'([{}"])', re.I)

# "name = value": group 1 is the field name, group 2 the rest of the line
field_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
# "name = value,": group 2 stops at the first comma
data_rex = re.compile(r'\s*(\w*)\s*=\s*([^,]*),?')

# a LaTeX \url{...} command; group 1 is the address
url_rex = re.compile(r'\\url\{([^}]*)\}')

#
# styles for html formatting
#
divstyle = 'margin-top: -4ex; margin-left: 10em;'
---|
98 | |
---|
99 | # |
---|
100 | # return the string parameter with \url{...} commands turned into HTML links |
---|
101 | # |
---|
def transformurls(str):
    r"""Return `str` with each LaTeX \url{address} turned into an HTML link.

    Every match of the module-level url_rex is replaced by
    <a href="address">address</a>; text outside the matches is unchanged.
    """
    link_template = r'<a href="\1">\1</a>'
    return url_rex.sub(link_template, str)
---|
104 | |
---|
105 | # |
---|
106 | # return the string parameter without braces |
---|
107 | # |
---|
def removebraces(str):
    """Return `str` with every '{' and '}' character deleted."""
    without_braces = rembraces_rex.sub('', str)
    return without_braces
---|
110 | |
---|
111 | # |
---|
112 | # latex-specific replacements |
---|
113 | # (do this after braces were removed) |
---|
114 | # |
---|
# (LaTeX sequence, replacement) pairs applied in this order by
# latexreplacements().  Accented letters are written as HTML character
# entities: they keep this source file pure ASCII and they are the
# "&xyz;" form that copychars() decodes.
_LATEX_REPLACEMENTS = (
    ('~', ' '),
    ("\\'a", '&aacute;'),
    ('\\"a', '&auml;'),
    ("\\'e", '&eacute;'),
    ('\\"e', '&euml;'),
    ("\\'i", '&iacute;'),
    ('\\"i', '&iuml;'),
    ("\\'o", '&oacute;'),
    ('\\"o', '&ouml;'),
    ("\\'u", '&uacute;'),
    ('\\"u', '&uuml;'),
    ('\\H o', '&otilde;'),
    ('\\H u', '&uuml;'),    # &utilde; does not exist
    ("\\'A", '&Aacute;'),
    ('\\"A', '&Auml;'),
    ("\\'E", '&Eacute;'),
    ('\\"E', '&Euml;'),
    ("\\'I", '&Iacute;'),
    ('\\"I', '&Iuml;'),
    ("\\'O", '&Oacute;'),
    ('\\"O', '&Ouml;'),
    ("\\'U", '&Uacute;'),
    ('\\"U', '&Uuml;'),
    ('\\H O', '&Otilde;'),
    ('\\H U', '&Uuml;'),    # &Utilde; does not exist
)


def latexreplacements(line):
    """Replace the supported LaTeX special sequences in `line`.

    '~' becomes a space and the accent commands listed in
    _LATEX_REPLACEMENTS become HTML character entities.  Braces must
    already have been removed, because only the bare forms (for example
    a backslash, an apostrophe and the letter) are recognised.

    @param line  the text to transform
    @return the transformed text; input without any of the sequences is
            returned unchanged
    """
    for latex, html in _LATEX_REPLACEMENTS:
        # str.replace works on Python 2 and 3; string.replace is Python 2 only
        line = line.replace(latex, html)
    return line
---|
143 | |
---|
144 | # |
---|
145 | # copy characters from a string decoding html expressions (&xyz;) |
---|
146 | # |
---|
def copychars(str, ifrom, count):
    """Collect up to `count` letters of `str`, starting at index `ifrom`.

    Only the ASCII letters A-Z and a-z are copied.  An HTML expression of
    the form &xyz; counts as one character and contributes the single
    character that follows the '&'; the rest of the expression, up to and
    including the ';', is skipped.

    @param str     the text to read
    @param ifrom   index of the first character to examine
    @param count   maximum number of characters to collect
    @return the collected characters as a string
    """
    picked = []
    size = len(str)
    pos = ifrom
    taken = 0
    in_entity = False
    while pos < size and taken < count:
        ch = str[pos]
        if ch == '&':
            # start of an html expression: keep the character after '&'
            in_entity = True
            if pos + 1 < size:
                picked.append(str[pos + 1])
            taken += 1
            pos += 2
            continue
        if in_entity:
            # skip everything up to the terminating ';'
            if ch == ';':
                in_entity = False
        elif 'A' <= ch <= 'Z' or 'a' <= ch <= 'z':
            picked.append(ch)
            taken += 1
        pos += 1

    return ''.join(picked)
---|
170 | |
---|
171 | |
---|
172 | # |
---|
173 | # Handle a list of authors (separated by 'and'). |
---|
174 | # It gives back a dictionary with the following values: |
---|
175 | # - num: the number of authors, |
---|
176 | # - list: the list of the author names, |
---|
177 | # - text: the bibtex text (separated by commas and/or 'and') |
---|
178 | # - abbrev: abbreviation that can be used for indicate the |
---|
179 | # bibliography entries |
---|
180 | # |
---|
def bibtexauthor(data):
    """Parse a BibTeX author/editor field into a dictionary.

    The field is split on the word 'and'.  Each name is stripped, has its
    braces removed and its LaTeX sequences replaced, and a name written as
    "Xyz, A. B." is turned into "A. B. Xyz".

    @param data  the raw field value
    @return a dictionary with the keys
            'list'   - the list of the normalised author names,
            'num'    - the number of authors,
            'text'   - the names as running text: "A", "A and B" or
                       "A, B, and C",
            'abbrev' - abbreviation built with copychars() from the last
                       word of each name: up to three letters for a single
                       author, one letter per author otherwise
    """
    names = []
    for raw_name in author_rex.split(data):
        # general transformations
        name = latexreplacements(removebraces(raw_name.strip()))
        # transform "Xyz, A. B." to "A. B. Xyz"
        comma = name.find(',')
        if comma != -1:
            name = name[comma + 1:].strip() + ' ' + name[:comma].strip()
        names.append(name)

    # Join the names directly instead of going through a '#' placeholder,
    # which would corrupt any name that itself contains a '#'.
    num = len(names)
    if num == 1:
        text = names[0]
    elif num == 2:
        text = names[0] + ' and ' + names[1]
    else:
        text = ', '.join(names[:-1]) + ', and ' + names[-1]

    letters = 3 if num == 1 else 1
    abbrev = ''.join(copychars(name, name.rfind(' ') + 1, letters)
                     for name in names)

    return {'list': names, 'num': num, 'text': text, 'abbrev': abbrev}
---|
214 | |
---|
215 | |
---|
216 | # |
---|
217 | # data = title string |
---|
218 | # @return the capitalized title (first letter is capitalized), rest are capitalized |
---|
219 | # only if capitalized inside braces |
---|
220 | # |
---|
def capitalizetitle(data):
    """Normalise the capitalisation of a title.

    The title is split into braced phrases ({...}) and the text between
    them.  Braced phrases keep their capitalisation and lose their braces.
    The first piece, when it is not braced, is left-stripped and
    capitalised with str.capitalize(); every other unbraced piece is
    lower-cased.

    @param data  the title string
    @return the re-capitalised title
    """
    pieces = []
    for index, phrase in enumerate(capitalize_rex.split(data)):
        # str.lstrip works on Python 2 and 3; string.lstrip is Python 2 only
        check = phrase.lstrip()
        if check.startswith('{'):
            # keep phrase's capitalization the same
            pieces.append(removebraces(phrase))
        elif index == 0:
            # first piece --> capitalize first letter (after spaces)
            pieces.append(check.capitalize())
        else:
            pieces.append(phrase.lower())

    return ''.join(pieces)
---|
240 | |
---|
241 | |
---|
242 | # |
---|
243 | # @return the bibtex for the title |
---|
244 | # @param data --> title string |
---|
245 | # braces are removed from title |
---|
246 | # |
---|
def bibtextitle(data, entrytype):
    """Return the output text for a title field.

    @param data       the raw title string
    @param entrytype  the lower-case entry type; titles of 'book' and
                      'inbook' entries keep their capitalisation, all
                      others go through capitalizetitle()
    @return the stripped title without braces
    """
    text = data.strip()
    if entrytype not in ('book', 'inbook'):
        text = capitalizetitle(text)
    return removebraces(text)
---|
254 | |
---|
255 | |
---|
256 | # |
---|
257 | # function to compare entry lists |
---|
258 | # |
---|
def entry_cmp(x, y):
    """Compare two entry lists by their first element (the sort key).

    @return -1, 0 or 1 when x[0] is less than, equal to or greater than
            y[0]
    """
    left, right = x[0], y[0]
    # the builtin cmp() was removed in Python 3; this is its usual spelling
    return (left > right) - (left < right)
---|
261 | |
---|
262 | |
---|
263 | # |
---|
264 | # print the XML for the transformed "filecont_source" |
---|
265 | # |
---|
def _has_value(entrycont, name):
    """Return True when field `name` is stored in entrycont and is not ''."""
    return name in entrycont and entrycont[name] != ''


def _emphasize(entrycont, name):
    """Wrap the stored field `name` in <em> tags; do nothing if it is absent."""
    if name in entrycont:
        entrycont[name] = '<em>' + entrycont[name] + '</em>'


def _format_entry(entrytype, entryid, entrycont):
    """Build the record of one entry: [sortkey, bibkey, entryid, line, ...].

    `entrycont` maps lower-case field names to their text; 'author' and
    'editor' map to the dictionaries returned by bibtexauthor().  The
    dictionary is modified in place by the entry type related formatting.
    """
    # entry type related formattings (fields the entry lacks are skipped
    # instead of raising KeyError)
    if entrytype in ('book', 'inbook'):
        _emphasize(entrycont, 'title')
        if 'author' not in entrycont and 'editor' in entrycont:
            entrycont['author'] = entrycont['editor']
            entrycont['author']['text'] += ', editors'
    elif entrytype == 'article':
        _emphasize(entrycont, 'journal')
    elif entrytype in ('inproceedings', 'incollection', 'conference'):
        _emphasize(entrycont, 'booktitle')
    elif entrytype == 'techreport':
        if 'type' not in entrycont:
            entrycont['type'] = 'Technical report'
    elif entrytype == 'mastersthesis':
        entrycont['type'] = "Master's thesis"
    elif entrytype == 'phdthesis':
        entrycont['type'] = 'PhD thesis'

    if _has_value(entrycont, 'pages'):
        entrycont['pages'] = entrycont['pages'].replace('--', '-')

    entry = []

    def add(name, prefix='', suffix=','):
        # append one output line for a plain text field, if it has a value
        if _has_value(entrycont, name):
            entry.append(prefix + entrycont[name] + suffix)

    if _has_value(entrycont, 'author'):
        entry.append(entrycont['author']['text'] + '.')
    add('title', suffix='.')
    add('journal')
    add('booktitle', prefix='In ')
    if _has_value(entrycont, 'type'):
        eline = entrycont['type']
        if _has_value(entrycont, 'number'):
            eline += ' ' + entrycont['number']
        entry.append(eline + ',')
    add('institution')
    add('publisher')
    add('school')
    add('address')
    add('edition', suffix=' edition,')
    add('howpublished')
    if _has_value(entrycont, 'volume'):
        # volume(number):pages
        eline = entrycont['volume']
        if _has_value(entrycont, 'number'):
            eline += '(' + entrycont['number'] + ')'
        if _has_value(entrycont, 'pages'):
            eline += ':' + entrycont['pages']
        entry.append(eline + ',')
    else:
        add('pages', prefix='pages ')
    if _has_value(entrycont, 'year'):
        if _has_value(entrycont, 'month'):
            entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
        else:
            entry.append(entrycont['year'] + '.')
    add('note', suffix='.')
    add('url', suffix='.')

    # generate keys for sorting and for the output
    sortkey = ''
    if 'author' in entrycont:
        for author in entrycont['author']['list']:
            sortkey += copychars(author, author.rfind(' ') + 1, len(author))
        bibkey = entrycont['author']['abbrev']
    else:
        bibkey = 'x'
    if 'year' in entrycont:
        sortkey += entrycont['year']
        bibkey += entrycont['year'][-2:]
    if 'title' in entrycont:
        sortkey += entrycont['title']
    if 'key' in entrycont:
        sortkey = entrycont['key'] + sortkey
        bibkey = entrycont['key']

    return [sortkey, bibkey, entryid] + entry


def bibtexdecoder(filecont_source):
    r"""Convert washed BibTeX lines into the lines of the Doxygen page body.

    @param filecont_source  iterable of lines, one per "@type{id," header,
                            "field = value" pair or closing "}", each
                            normally ending in a newline (the form
                            returned by bibtexwasher())
    @return a list of output lines; for every entry, in sort key order:
            "\section <id> [<key>]", an opening <div> using divstyle, the
            formatted lines of the entry, "</div>" and an empty line.
            Entries that share a key get the suffixes a, b, c, ...
    """
    # want @<alphanumeric chars><spaces>{<spaces><any chars>,
    pubtype_rex = re.compile(r'@(\w*)\s*{\s*(.*),')
    # a line holding only the closing brace of an entry
    endtype_rex = re.compile(r'}\s*$')
    bracedata_rex = re.compile(r'\s*(\w*)\s*=\s*{(.*)},?')
    quotedata_rex = re.compile(r'\s*(\w*)\s*=\s*"(.*)",?')

    filecont = []
    entrycont = None    # fields of the current entry; None before the first
    entrytype = ''
    entryid = ''

    for line in filecont_source:
        # drop the line terminator (and only that)
        if line.endswith('\n'):
            line = line[:-1]

        # encode character entities
        line = line.replace('&', '&amp;')
        line = line.replace('<', '&lt;')
        line = line.replace('>', '&gt;')

        # start entry: publication type (store for later use)
        if pubtype_rex.match(line):
            entrycont = {}
            entrytype = pubtype_rex.sub(r'\g<1>', line).lower()
            entryid = pubtype_rex.sub(r'\g<2>', line)
            continue

        # input before the first entry header belongs to no entry
        if entrycont is None:
            continue

        # end entry if just a }
        if endtype_rex.match(line):
            filecont.append(_format_entry(entrytype, entryid, entrycont))
            continue

        # field, publication info
        field = ''
        data = ''
        if bracedata_rex.match(line):
            # field = {data} entries
            field = field_rex.sub(r'\g<1>', line).lower()
            data = bracedata_rex.sub(r'\g<2>', line)
        elif quotedata_rex.match(line):
            # field = "data" entries
            field = field_rex.sub(r'\g<1>', line).lower()
            data = quotedata_rex.sub(r'\g<2>', line)
        elif data_rex.match(line):
            # field = data entries
            field = field_rex.sub(r'\g<1>', line).lower()
            data = data_rex.sub(r'\g<2>', line)

        if field == '':
            # not a "field = value" line: nothing to store
            continue

        if field == 'url':
            data = '\\url{' + data.strip() + '}'

        if field in ('author', 'editor'):
            entrycont[field] = bibtexauthor(data)
            continue

        if field == 'title':
            value = bibtextitle(data, entrytype)
        else:
            value = removebraces(transformurls(data.strip()))

        # empty values are not stored
        if value != '':
            entrycont[field] = latexreplacements(value)

    # sort entries by their sort key
    filecont.sort(key=lambda entry: entry[0])

    # count the bibtex keys
    keytable = {}
    for entry in filecont:
        keytable[entry[1]] = keytable.get(entry[1], 0) + 1
    counttable = dict.fromkeys(keytable, 0)

    # generate output
    output = []
    for entry in filecont:
        # generate output key from the bibtex key
        bibkey = entry[1]
        entryid = entry[2]
        if keytable[bibkey] == 1:
            outkey = bibkey
        else:
            # disambiguate repeated keys with a, b, c, ...
            outkey = bibkey + chr(97 + counttable[bibkey])
            counttable[bibkey] += 1

        # append the entry code to the output
        output.append('\\section ' + entryid + ' [' + outkey + ']')
        output.append('<div style="' + divstyle + '">')
        output.extend(entry[3:])
        output.append('</div>')
        output.append('')

    return output
---|
473 | |
---|
474 | |
---|
475 | # |
---|
476 | # return 1 iff abbr is in line but not inside braces or quotes |
---|
477 | # assumes that abbr appears only once on the line (out of braces and quotes) |
---|
478 | # |
---|
def verify_out_of_braces(line, abbr):
    """Tell whether `abbr` occurs in `line` outside braces and quotes.

    The line is split on {, } and " while the brace depth and the quote
    state are tracked; `abbr` is searched as a whole word, ignoring case,
    in the text between the delimiters.

    @param line  the line to scan
    @param abbr  the abbreviation name (inserted into a regular expression
                 as is)
    @return 1 if a text piece at brace depth 0 and outside quotes contains
            abbr, otherwise 0
    """
    word_rex = re.compile('\\b' + abbr + '\\b', re.I)

    brace_depth = 0
    inside_quotes = False

    for token in delimiter_rex.split(line):
        if token == '{':
            brace_depth += 1
        elif token == '}':
            brace_depth -= 1
        elif token == '"':
            inside_quotes = not inside_quotes
        elif brace_depth == 0 and not inside_quotes and word_rex.search(token):
            return 1

    return 0
---|
503 | |
---|
504 | |
---|
505 | # |
---|
506 | # a line in the form phrase1 # phrase2 # ... # phrasen |
---|
507 | # is returned as phrase1 phrase2 ... phrasen |
---|
508 | # with the correct punctuation |
---|
509 | # Bug: Doesn't always work with multiple abbreviations plugged in |
---|
510 | # |
---|
def concat_line(line):
    r"""Join the '#'-concatenated phrases of a "field = ..." line.

    The part after the equals sign is split on '#'.  The delimiters
    between the phrases are dropped, and the outer ones are rewritten so
    that the whole value is brace delimited: a leading '"' of the first
    phrase becomes '{', and a trailing '"' (or '",') of the last phrase
    becomes '}' (or '},').  A phrase ending in a backslash gets its '#'
    back, since that was an escaped \# rather than a concatenation.

    @param line  a line of the form  field = phrase1 # phrase2 # ...
    @return "field = phrase1 phrase2 ..." with the phrases separated by
            single spaces
    """
    # only look at part after equals
    field = field_rex.sub(r'\g<1>', line)
    rest = field_rex.sub(r'\g<2>', line)

    phrases = concatsplit_rex.split(rest)
    last = len(phrases) - 1
    parts = [field + ' =']

    for index, phrase in enumerate(phrases):
        phrase = phrase.strip()

        # opening delimiter
        if index == 0:
            if phrase.startswith('"'):
                phrase = '{' + phrase[1:]
        elif phrase.startswith(('"', '{')):
            phrase = phrase[1:]

        # closing delimiter
        if index != last:
            if phrase.endswith(('"', '}')):
                phrase = phrase[:-1]
        elif phrase.endswith('"'):
            phrase = phrase[:-1] + '}'
        elif phrase.endswith('",'):
            phrase = phrase[:-2] + '},'

        # if phrase did have \#, add the \# back
        if phrase.endswith('\\'):
            phrase += '#'

        parts.append(phrase)

    return ' '.join(parts)
---|
550 | |
---|
551 | |
---|
552 | # |
---|
553 | # substitute abbreviations into filecont |
---|
554 | # @param filecont_source - string of data from file |
---|
555 | # |
---|
def bibtex_replace_abbreviations(filecont_source):
    """Substitute @string abbreviations into the washed file contents.

    Lines holding @string definitions, @comment and @preamble headers are
    dropped.  In the remaining lines every known abbreviation that
    verify_out_of_braces() accepts is replaced by its value, and '#'
    concatenations on lines containing an abbreviation are resolved with
    concat_line().  The month names jan ... dec are predefined.

    @param filecont_source  string of data from file
    @return the processed data as one string, one entry header, field or
            closing brace per line
    """
    # These are defined in bibtex, so we'll define them too
    abbr_list = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    value_list = ['January', 'February', 'March', 'April',
                  'May', 'June', 'July', 'August', 'September',
                  'October', 'November', 'December']

    # an abbreviation is a whole word, optionally followed by a comma
    front = r'\b'
    back = r'(,?)\b'
    abbr_rex = [re.compile(front + abbr + back, re.I) for abbr in abbr_list]

    abbrdef_rex = re.compile(
        r'\s*@string\s*{\s*(' + valid_name_chars + r'*)\s*=(.*)', re.I)
    comment_rex = re.compile(r'@comment\s*{', re.I)
    preamble_rex = re.compile(r'@preamble\s*{', re.I)

    waiting_for_end_string = False
    kept_lines = []

    for line in filecont_source.splitlines():
        if line == ' ' or line == '':
            continue

        # NOTE(review): the indentation of this part is lost in SOURCE.
        # This keeps the reading in which only the line holding the closing
        # brace is dropped while waiting -- confirm against the original.
        if waiting_for_end_string and '}' in line:
            waiting_for_end_string = False
            continue

        if abbrdef_rex.search(line):
            abbr = abbrdef_rex.sub(r'\g<1>', line)
            # the first definition of a name wins
            if abbr not in abbr_list:
                val = abbrdef_rex.sub(r'\g<2>', line)
                abbr_list.append(abbr)
                value_list.append(val.strip())
                abbr_rex.append(re.compile(front + abbr + back, re.I))
            waiting_for_end_string = True
            continue

        if comment_rex.search(line) or preamble_rex.search(line):
            waiting_for_end_string = True
            continue

        # replace subsequent abbreviations with the value
        for abbr, value, rex in zip(abbr_list, value_list, abbr_rex):
            if rex.search(line):
                if verify_out_of_braces(line, abbr) == 1:
                    # A function replacement inserts the value literally.
                    # Used as a template string, backslashes in LaTeX
                    # values would be read as regex escapes.
                    line = rex.sub(
                        lambda match, value=value: value + match.group(1),
                        line)
                # Check for # concatenations
                if concatsplit_rex.search(line):
                    line = concat_line(line)

        kept_lines.append(line + '\n')

    filecont2 = ''.join(kept_lines)

    # Do one final pass over file

    # make sure that didn't end up with {" or }" after the substitution
    filecont2 = filecont2.replace('{"', '{{')
    filecont2 = filecont2.replace('"}', '}}')

    afterquotevalue_rex = re.compile(r'"\s*,\s*')
    afterbrace_rex = re.compile(r'"\s*}')
    afterbracevalue_rex = re.compile(r'(=\s*{[^=]*)},\s*')

    # add new lines to data that changed because of abbreviation substitutions
    filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
    filecont2 = afterbrace_rex.sub('"\n}', filecont2)
    filecont2 = afterbracevalue_rex.sub('\\g<1>},\n', filecont2)

    return filecont2
---|
651 | |
---|
652 | # |
---|
653 | # convert @type( ... ) to @type{ ... } |
---|
654 | # |
---|
def no_outer_parens(filecont):
    """Convert @type( ... ) to @type{ ... }.

    The text is split on parentheses and braces.  A '(' that directly
    follows a piece of text containing '@' is rewritten to '{', and the
    ')' that brings the parenthesis count back to zero is rewritten to
    '}'.  Parentheses nested inside such an entry, and entries that
    already use braces, are left alone.

    @param filecont  the file contents as one string
    @return the contents with the outer parentheses of entries replaced
    """
    pieces = []
    open_paren_count = 0
    open_type = False   # inside an entry that was opened with '('
    look_next = False   # previous piece contained '@': inspect this one

    for phrase in re.split(r'([(){}])', filecont):
        if look_next:
            if phrase == '(':
                phrase = '{'
                open_paren_count += 1
            else:
                # the entry uses braces already (or is no entry at all)
                open_type = False
            look_next = False

        if phrase == '(':
            open_paren_count += 1
        elif phrase == ')':
            open_paren_count -= 1
            if open_type and open_paren_count == 0:
                phrase = '}'
                open_type = False
        elif '@' in phrase:
            open_type = True
            look_next = True

        pieces.append(phrase)

    # join once instead of growing the result string piece by piece
    return ''.join(pieces)
---|
695 | |
---|
696 | |
---|
697 | # |
---|
698 | # make all whitespace into just one space |
---|
699 | # format the bibtex file into a usable form. |
---|
700 | # |
---|
def bibtexwasher(filecont_source):
    """Format the raw lines of a BibTeX file into a usable form.

    Whitespace runs are collapsed to one space, comment lines (starting
    with %) and empty lines are dropped, @type( ... ) is converted to
    @type{ ... }, line breaks are inserted so that entry headers, fields
    and closing braces start on their own lines, and @string
    abbreviations are substituted.

    @param filecont_source  iterable of the lines of the file
    @return the list of the non-blank washed lines, each ending in a
            newline
    """
    space_rex = re.compile(r'\s+')
    comment_rex = re.compile(r'\s*%')

    # remove trailing and excessive whitespace
    # ignore comments
    washed = []
    for line in filecont_source:
        line = space_rex.sub(' ', line.strip())
        if line != '' and not comment_rex.match(line):
            washed.append(' ' + line)

    # the file is in one long string
    filecont = ''.join(washed)

    filecont = no_outer_parens(filecont)

    #
    # split lines according to preferred syntax scheme
    #
    filecont = re.sub(r'(=\s*{[^=]*)},', '\\g<1>},\n', filecont)

    # add new lines after commas that are after values
    filecont = re.sub(r'"\s*,', '",\n', filecont)
    filecont = re.sub(r'=\s*([\w\d]+)\s*,', '= \\g<1>,\n', filecont)
    filecont = re.sub(r'(@\w*)\s*({(\s*)[^,\s]*)\s*,',
                      '\n\n\\g<1>\\g<2>,\n', filecont)

    # add new lines after }
    filecont = re.sub(r'"\s*}', '"\n}\n', filecont)
    filecont = re.sub(r'}\s*,', '},\n', filecont)

    filecont = re.sub(r'@(\w*)', '\n@\\g<1>', filecont)

    # character encoding, reserved latex characters
    filecont = re.sub(r'{\\&}', '&', filecont)
    filecont = re.sub(r'\\&', '&', filecont)

    # do checking for open braces to get format correct:
    # a brace that closes an entry goes to the start of a new line
    open_brace_count = 0
    pieces = []
    for phrase in re.split(r'([{}])', filecont):
        if phrase == '{':
            open_brace_count += 1
        elif phrase == '}':
            open_brace_count -= 1
            if open_brace_count == 0:
                pieces.append('\n')
        pieces.append(phrase)
    filecont = ''.join(pieces)

    filecont2 = bibtex_replace_abbreviations(filecont)

    # gather the lines, ignoring blank ones
    return [line + '\n'
            for line in filecont2.splitlines()
            if line != '' and line != ' ']
---|
783 | |
---|
784 | |
---|
def filehandler(filepath):
    r"""Convert the BibTeX file `filepath` and print the Doxygen page.

    The output is written to standard output: the opening "/**", the
    "\page references References" header, an empty line, the lines
    produced by bibtexdecoder() and the closing "*/".

    @param filepath  path of the .bib file to read
    Exits through sys.exit() with the message "Could not open file: <path>"
    (printed on standard error, exit status 1) when the file cannot be
    read, so that nothing is written to a redirected standard output.
    """
    import sys
    try:
        # the with statement closes the file even if reading fails
        with open(filepath, 'r') as fd:
            filecont_source = fd.readlines()
    except (IOError, OSError):
        sys.exit('Could not open file: ' + str(filepath))

    washeddata = bibtexwasher(filecont_source)
    outdata = bibtexdecoder(washeddata)

    # single-argument print(...) calls behave the same on Python 2 and 3
    print('/**')
    print('\\page references References')
    print('')
    for line in outdata:
        print(line)
    print('*/')
---|
800 | |
---|
801 | |
---|
802 | # main program |
---|
803 | |
---|
def main():
    """Run the converter on the file named by the first command line argument.

    Without an argument the program exits through sys.exit() with the
    message "No input file", which goes to standard error with exit
    status 1; standard output is normally redirected into the .dox file.
    """
    import sys
    if not sys.argv[1:]:
        sys.exit('No input file')
    filehandler(sys.argv[1])


if __name__ == "__main__":
    main()
---|
814 | |
---|
815 | |
---|
816 | # end python script |
---|