1 #!/usr/bin/env /usr/local/Python/bin/python2.1
3 BibTeX to Doxygen converter
4 Usage: python bib2dox.py bibfile.bib > bibfile.dox
6 This code is a modification of the BibTeX to XML converter
7 by Vidar Bronken Gundersen et al. See the original copyright notices below.
9 **********************************************************************
11 Decoder for bibliographic data, BibTeX
12 Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
15 (c)2002-06-23 Vidar Bronken Gundersen
16 http://bibtexml.sf.net/
17 Reuse approved as long as this notification is kept.
20 Contributions/thanks to:
21 Egon Willighagen, http://sf.net/projects/jreferences/
22 Richard Mahoney (for providing a test case)
24 Edited by Sara Sprenkle to be more robust and handle more bibtex features.
27 1. Changed bibtex: tags to bibxml: tags.
28 2. Use xmlns:bibxml="http://bibtexml.sf.net/"
29 3. Allow spaces between @type and first {
30 4. "author" fields with multiple authors split by " and "
31 are put in separate xml "bibxml:author" tags.
32 5. Option for Titles: words are capitalized
33 only if first letter in title or capitalized inside braces
34 6. Removes braces from within field values
35 7. Ignores comments in bibtex file (including @comment{ or % )
36 8. Replaces some special latex tags, e.g., replaces ~ with ' '
37 9. Handles bibtex @string abbreviations
38 --> includes bibtex's default abbreviations for months
39 --> does concatenation of abbr # " more " and " more " # abbr
40 10. Handles @type( ... ) or @type{ ... }
41 11. The keywords field is split on , or ; and put into separate xml
42 "bibxml:keywords" tags
46 1. Does not transform Latex encoding like math mode and special
48 2. Does not parse author fields into first and last names.
49 E.g., It does not do anything special to an author whose name is
50 in the form LAST_NAME, FIRST_NAME
51 In "author" tag, will show up as
52 <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
53 3. Does not handle "crossref" fields other than to print
54 <bibxml:crossref>...</bibxml:crossref>
55 4. Does not inform user of the input's format errors. You just won't
56 be able to transform the file later with XSL
58 You will have to manually edit the XML output if you need to handle
59 these (and unknown) limitations.
# Module-level regular expressions shared by the converter functions.
# Every pattern is written as a raw string so that regex escapes such as
# \s, \w and \{ are never interpreted as Python string escapes; the pattern
# text itself is unchanged.

# set of valid name characters
valid_name_chars = r'[\w\-:]'

# define global regular expression variables

# the word "and" surrounded by whitespace
author_rex = re.compile(r'\s+and\s+')
# any single opening or closing brace
rembraces_rex = re.compile(r'[{}]')
# a brace-enclosed run of word characters, captured so that split() keeps it
capitalize_rex = re.compile(r'({\w*})')

# used by bibtexkeywords(data)
keywords_rex = re.compile(r'[,;]')

# used by concat_line(line)
concatsplit_rex = re.compile(r'\s*#\s*')

# split on {, }, or " in verify_out_of_braces
delimiter_rex = re.compile(r'([{}"])', re.I)

# "name = rest": group 1 is the field name, group 2 everything after the '='
field_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
# "name = value,": group 1 is the field name, group 2 the value up to a comma
data_rex = re.compile(r'\s*(\w*)\s*=\s*([^,]*),?')

# LaTeX \url{...} command: group 1 is the text between the braces
url_rex = re.compile(r'\\url\{([^}]*)\}')
91 # return the string parameter with \url{...} commands turned into HTML links
def transformurls(str):
    """Return str with each match of url_rex rewritten as an HTML link.

    The text captured by group 1 of url_rex (the content between the
    braces of the url command) is used both as the link target and as
    the link text.
    """
    def as_link(found):
        # Same expansion as the template '<a href="\1">\1</a>'.
        address = found.group(1)
        return '<a href="' + address + '">' + address + '</a>'

    return url_rex.sub(as_link, str)
97 # return the string parameter without braces
def removebraces(str):
    """Return str with every character matched by rembraces_rex deleted."""
    # Splitting on the braces and gluing the pieces back together drops
    # exactly the matched characters.
    pieces = rembraces_rex.split(str)
    return ''.join(pieces)
103 # latex-specific replacements
104 # (do this after braces were removed)
106 def latexreplacements(line):
107 line = string.replace(line, '~', ' ')
108 line = string.replace(line, '\\\'a', 'á')
109 line = string.replace(line, '\\"a', 'ä')
110 line = string.replace(line, '\\\'e', 'é')
111 line = string.replace(line, '\\"e', 'ë')
112 line = string.replace(line, '\\\'i', 'í')
113 line = string.replace(line, '\\"i', 'ï')
114 line = string.replace(line, '\\\'o', 'ó')
115 line = string.replace(line, '\\"o', 'ö')
116 line = string.replace(line, '\\\'u', 'ú')
117 line = string.replace(line, '\\"u', 'ü')
118 line = string.replace(line, '\\H o', 'õ')
119 line = string.replace(line, '\\H u', 'ü') # ũ does not exist
120 line = string.replace(line, '\\\'A', 'Á')
121 line = string.replace(line, '\\"A', 'Ä')
122 line = string.replace(line, '\\\'E', 'É')
123 line = string.replace(line, '\\"E', 'Ë')
124 line = string.replace(line, '\\\'I', 'Í')
125 line = string.replace(line, '\\"I', 'Ï')
126 line = string.replace(line, '\\\'O', 'Ó')
127 line = string.replace(line, '\\"O', 'Ö')
128 line = string.replace(line, '\\\'U', 'Ú')
129 line = string.replace(line, '\\"U', 'Ü')
130 line = string.replace(line, '\\H O', 'Õ')
131 line = string.replace(line, '\\H U', 'Ü') # Ũ does not exist
136 # copy characters from a string, decoding html expressions (&xyz;)
138 def copychars(str, ifrom, count):
143 while (i < len(str)) and (c < count):
152 if ((str[i] >= 'A') and (str[i] <= 'Z')) or \
153 ((str[i] >= 'a') and (str[i] <= 'z')):
164 # Handle a list of authors (separated by 'and').
165 # It gives back a dictionary with the following values:
166 # - num: the number of authors,
167 # - list: the list of the author names,
168 # - text: the bibtex text (separated by commas and/or 'and')
169 # - abbrev: abbreviation that can be used to indicate the
170 # bibliography entries
172 def bibtexauthor(data):
175 result['list'] = author_rex.split(data)
176 result['num'] = len(result['list'])
177 for i, author in enumerate(result['list']):
178 # general transformations
179 author = latexreplacements(removebraces(author.strip()))
180 # transform "Xyz, A. B." to "A. B. Xyz"
181 pos = author.find(',')
183 author = author[pos+1:].strip() + ' ' + author[:pos].strip()
184 result['list'][i] = author
185 bibtex += author + '#'
187 if result['num'] > 1:
188 ix = bibtex.rfind('#')
189 if result['num'] == 2:
190 bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
192 bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
193 bibtex = bibtex.replace('#', ', ')
194 result['text'] = bibtex
196 result['abbrev'] = ''
197 for author in result['list']:
198 pos = author.rfind(' ') + 1
200 if result['num'] == 1:
202 result['abbrev'] += copychars(author, pos, count)
208 # data = title string
209 # @return the capitalized title (first letter is capitalized), rest are capitalized
210 # only if capitalized inside braces
212 def capitalizetitle(data):
213 title_list = capitalize_rex.split(data)
216 for phrase in title_list:
217 check = string.lstrip(phrase)
219 # keep phrase's capitalization the same
220 if check.find('{') == 0:
221 title += removebraces(phrase)
223 # first word --> capitalize first letter (after spaces)
225 title += check.capitalize()
227 title += phrase.lower()
234 # @return the bibtex for the title
235 # @param data --> title string
236 # braces are removed from title
238 def bibtextitle(data, entrytype):
239 if entrytype in ('book', 'inbook'):
240 title = removebraces(data.strip())
242 title = removebraces(capitalizetitle(data.strip()))
248 # function to compare entry lists
251 return cmp(x[0], y[0])
255 # print the XML for the transformed "filecont_source"
257 def bibtexdecoder(filecont_source):
261 # want @<alphanumeric chars><spaces>{<spaces><any chars>,
262 pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')
263 endtype_rex = re.compile('}\s*$')
264 endtag_rex = re.compile('^\s*}\s*$')
266 bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
267 bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')
269 quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
270 quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')
272 for line in filecont_source:
275 # encode character entities
276 line = string.replace(line, '&', '&')
277 line = string.replace(line, '<', '<')
278 line = string.replace(line, '>', '>')
280 # start entry: publication type (store for later use)
281 if pubtype_rex.match(line):
282 # want @<alphanumeric chars><spaces>{<spaces><any chars>,
285 entrytype = pubtype_rex.sub('\g<1>',line)
286 entrytype = string.lower(entrytype)
287 # entryid = pubtype_rex.sub('\g<2>', line)
289 # end entry if just a }
290 elif endtype_rex.match(line):
291 # generate doxygen code for the entry
293 # entry type related formatting
294 if entrytype in ('book', 'inbook'):
295 entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
296 if not entrycont.has_key('author'):
297 entrycont['author'] = entrycont['editor']
298 entrycont['author']['text'] += ', editors'
299 elif entrytype == 'article':
300 entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
301 elif entrytype in ('inproceedings', 'incollection', 'conference'):
302 entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
303 elif entrytype == 'techreport':
304 if not entrycont.has_key('type'):
305 entrycont['type'] = 'Technical report'
306 elif entrytype == 'mastersthesis':
307 entrycont['type'] = 'Master\'s thesis'
308 elif entrytype == 'phdthesis':
309 entrycont['type'] = 'PhD thesis'
311 for eline in entrycont:
313 eline = latexreplacements(eline)
315 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
316 entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')
318 if entrycont.has_key('author') and (entrycont['author'] != ''):
319 entry.append(entrycont['author']['text'] + '.')
320 if entrycont.has_key('title') and (entrycont['title'] != ''):
321 entry.append(entrycont['title'] + '.')
322 if entrycont.has_key('journal') and (entrycont['journal'] != ''):
323 entry.append(entrycont['journal'] + ',')
324 if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
325 entry.append('In ' + entrycont['booktitle'] + ',')
326 if entrycont.has_key('type') and (entrycont['type'] != ''):
327 eline = entrycont['type']
328 if entrycont.has_key('number') and (entrycont['number'] != ''):
329 eline += ' ' + entrycont['number']
332 if entrycont.has_key('institution') and (entrycont['institution'] != ''):
333 entry.append(entrycont['institution'] + ',')
334 if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
335 entry.append(entrycont['publisher'] + ',')
336 if entrycont.has_key('school') and (entrycont['school'] != ''):
337 entry.append(entrycont['school'] + ',')
338 if entrycont.has_key('address') and (entrycont['address'] != ''):
339 entry.append(entrycont['address'] + ',')
340 if entrycont.has_key('edition') and (entrycont['edition'] != ''):
341 entry.append(entrycont['edition'] + ' edition,')
342 if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
343 entry.append(entrycont['howpublished'] + ',')
344 if entrycont.has_key('volume') and (entrycont['volume'] != ''):
345 eline = entrycont['volume'];
346 if entrycont.has_key('number') and (entrycont['number'] != ''):
347 eline += '(' + entrycont['number'] + ')'
348 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
349 eline += ':' + entrycont['pages']
353 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
354 entry.append('pages ' + entrycont['pages'] + ',')
355 if entrycont.has_key('year') and (entrycont['year'] != ''):
356 if entrycont.has_key('month') and (entrycont['month'] != ''):
357 entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
359 entry.append(entrycont['year'] + '.')
360 if entrycont.has_key('note') and (entrycont['note'] != ''):
361 entry.append(entrycont['note'] + '.')
363 # generate keys for sorting and for the output
366 if entrycont.has_key('author'):
367 for author in entrycont['author']['list']:
368 sortkey += copychars(author, author.rfind(' ')+1, len(author))
369 bibkey = entrycont['author']['abbrev']
372 if entrycont.has_key('year'):
373 sortkey += entrycont['year']
374 bibkey += entrycont['year'][-2:]
375 if entrycont.has_key('title'):
376 sortkey += entrycont['title']
377 if entrycont.has_key('key'):
378 sortkey = entrycont['key'] + sortkey
379 bibkey = entrycont['key']
380 entry.insert(0, sortkey)
381 entry.insert(1, bibkey)
383 # add the entry to the file contents
384 filecont.append(entry)
387 # field, publication info
391 # field = {data} entries
392 if bracedata_rex.match(line):
393 field = bracefield_rex.sub('\g<1>', line)
394 field = string.lower(field)
395 data = bracedata_rex.sub('\g<2>', line)
397 # field = "data" entries
398 elif quotedata_rex.match(line):
399 field = quotefield_rex.sub('\g<1>', line)
400 field = string.lower(field)
401 data = quotedata_rex.sub('\g<2>', line)
403 # field = data entries
404 elif data_rex.match(line):
405 field = field_rex.sub('\g<1>', line)
406 field = string.lower(field)
407 data = data_rex.sub('\g<2>', line)
409 if field in ('author', 'editor'):
410 entrycont[field] = bibtexauthor(data)
412 elif field == 'title':
413 line = bibtextitle(data, entrytype)
415 line = removebraces(transformurls(data.strip()))
418 line = latexreplacements(line)
419 entrycont[field] = line
423 filecont.sort(entry_cmp)
425 # count the bibtex keys
428 for entry in filecont:
430 if not keytable.has_key(bibkey):
433 keytable[bibkey] += 1
435 for bibkey in keytable.keys():
436 counttable[bibkey] = 0
439 for entry in filecont:
440 # generate output key from the bibtex key
442 if keytable[bibkey] == 1:
445 outkey = bibkey + chr(97 + counttable[bibkey])
446 counttable[bibkey] += 1
448 # append the entry code to the output
449 file.append('<tr valign="top">\n' + \
450 '<td>[' + outkey + ']</td>')
452 file.append('\\anchor ' + outkey)
453 for line in entry[2:]:
455 file.append('</td>\n</tr>')
462 # return 1 iff abbr is in line but not inside braces or quotes
463 # assumes that abbr appears only once on the line (out of braces and quotes)
465 def verify_out_of_braces(line, abbr):
467 phrase_split = delimiter_rex.split(line)
469 abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)
474 for phrase in phrase_split:
476 open_brace = open_brace + 1
478 open_brace = open_brace - 1
484 elif abbr_rex.search(phrase):
485 if open_brace == 0 and open_quote == 0:
492 # a line in the form phrase1 # phrase2 # ... # phrasen
493 # is returned as phrase1 phrase2 ... phrasen
494 # with the correct punctuation
495 # Bug: Doesn't always work with multiple abbreviations plugged in
497 def concat_line(line):
498 # only look at part after equals
499 field = field_rex.sub('\g<1>',line)
500 rest = field_rex.sub('\g<2>',line)
502 concat_line = field + ' ='
504 pound_split = concatsplit_rex.split(rest)
507 length = len(pound_split)
509 for phrase in pound_split:
510 phrase = phrase.strip()
511 if phrase_count != 0:
512 if phrase.startswith('"') or phrase.startswith('{'):
514 elif phrase.startswith('"'):
515 phrase = phrase.replace('"','{',1)
517 if phrase_count != length-1:
518 if phrase.endswith('"') or phrase.endswith('}'):
521 if phrase.endswith('"'):
523 phrase = phrase + "}"
524 elif phrase.endswith('",'):
526 phrase = phrase + "},"
528 # if phrase did have \#, add the \# back
529 if phrase.endswith('\\'):
530 phrase = phrase + "#"
531 concat_line = concat_line + ' ' + phrase
533 phrase_count = phrase_count + 1
539 # substitute abbreviations into filecont
540 # @param filecont_source - string of data from file
542 def bibtex_replace_abbreviations(filecont_source):
543 filecont = filecont_source.splitlines()
545 # These are defined in bibtex, so we'll define them too
546 abbr_list = ['jan','feb','mar','apr','may','jun',
547 'jul','aug','sep','oct','nov','dec']
548 value_list = ['January','February','March','April',
549 'May','June','July','August','September',
550 'October','November','December']
559 abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
560 total_abbr_count = total_abbr_count + 1
563 abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
566 comment_rex = re.compile('@comment\s*{',re.I)
567 preamble_rex = re.compile('@preamble\s*{',re.I)
569 waiting_for_end_string = 0
573 for line in filecont:
574 if line == ' ' or line == '':
577 if waiting_for_end_string:
578 if re.search('}',line):
579 waiting_for_end_string = 0
582 if abbrdef_rex.search(line):
583 abbr = abbrdef_rex.sub('\g<1>', line)
585 if abbr_list.count(abbr) == 0:
586 val = abbrdef_rex.sub('\g<2>', line)
587 abbr_list.append(abbr)
588 value_list.append(string.strip(val))
589 abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
590 total_abbr_count = total_abbr_count + 1
591 waiting_for_end_string = 1
594 if comment_rex.search(line):
595 waiting_for_end_string = 1
598 if preamble_rex.search(line):
599 waiting_for_end_string = 1
603 # replace subsequent abbreviations with the value
608 if abbr_rex[abbr_count].search(line):
609 if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:
610 line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)
611 # Check for # concatenations
612 if concatsplit_rex.search(line):
613 line = concat_line(line)
614 abbr_count = abbr_count + 1
617 filecont2 = filecont2 + line + '\n'
621 # Do one final pass over file
623 # make sure that didn't end up with {" or }" after the substitution
624 filecont2 = filecont2.replace('{"','{{')
625 filecont2 = filecont2.replace('"}','}}')
627 afterquotevalue_rex = re.compile('"\s*,\s*')
628 afterbrace_rex = re.compile('"\s*}')
629 afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')
631 # add new lines to data that changed because of abbreviation substitutions
632 filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
633 filecont2 = afterbrace_rex.sub('"\n}', filecont2)
634 filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)
639 # convert @type( ... ) to @type{ ... }
641 def no_outer_parens(filecont):
643 # do checking for open parens
644 # will convert to braces
645 paren_split = re.split('([(){}])',filecont)
654 at_rex = re.compile('@\w*')
656 for phrase in paren_split:
660 open_paren_count = open_paren_count + 1
666 open_paren_count = open_paren_count + 1
669 open_paren_count = open_paren_count - 1
670 if open_type == 1 and open_paren_count == 0:
674 elif at_rex.search( phrase ):
678 filecont = filecont + phrase
684 # make all whitespace into just one space
685 # format the bibtex file into a usable form.
687 def bibtexwasher(filecont_source):
689 space_rex = re.compile('\s+')
690 comment_rex = re.compile('\s*%')
694 # remove trailing and excessive whitespace
696 for line in filecont_source:
697 line = string.strip(line)
698 line = space_rex.sub(' ', line)
700 if not comment_rex.match(line) and line != '':
701 filecont.append(' '+ line)
703 filecont = string.join(filecont, '')
705 # the file is in one long string
707 filecont = no_outer_parens(filecont)
710 # split lines according to preferred syntax scheme
712 filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)
714 # add new lines after commas that are after values
715 filecont = re.sub('"\s*,', '",\n', filecont)
716 filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
717 filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',
718 '\n\n\g<1>\g<2>,\n', filecont)
720 # add new lines after }
721 filecont = re.sub('"\s*}','"\n}\n', filecont)
722 filecont = re.sub('}\s*,','},\n', filecont)
725 filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)
727 # character encoding, reserved latex characters
728 filecont = re.sub('{\\\&}', '&', filecont)
729 filecont = re.sub('\\\&', '&', filecont)
731 # do checking for open braces to get format correct
733 brace_split = re.split('([{}])',filecont)
738 for phrase in brace_split:
740 open_brace_count = open_brace_count + 1
742 open_brace_count = open_brace_count - 1
743 if open_brace_count == 0:
744 filecont = filecont + '\n'
746 filecont = filecont + phrase
748 filecont2 = bibtex_replace_abbreviations(filecont)
751 filecont = filecont2.splitlines()
753 j=0 # count the number of blank lines
754 for line in filecont:
756 if line == '' or line == ' ':
759 filecont[i] = line + '\n'
762 # get rid of the extra stuff at the end of the array
763 # (The extra stuff are duplicates that are in the array because
764 # blank lines were removed.)
765 length = len( filecont)
766 filecont[length-j:length] = []
771 def filehandler(filepath):
773 fd = open(filepath, 'r')
774 filecont_source = fd.readlines()
777 print 'Could not open file:', filepath
778 washeddata = bibtexwasher(filecont_source)
779 outdata = bibtexdecoder(washeddata)
781 print '\page references References'
783 print '<table border="0" cellspacing="5px" width="100%">'
797 filepath = sys.argv[1]
799 print "No input file"
801 filehandler(filepath)
803 if __name__ == "__main__": main()