1 #!/usr/bin/env /usr/local/Python/bin/python2.1
3 BibTeX to Doxygen converter
4 Usage: python bib2dox.py bibfile.bib > bibfile.dox
6 This code is a modification of the BibTeX to XML converter
7 by Vidar Bronken Gundersen et al. See the original copyright notices below.
9 **********************************************************************
11 Decoder for bibliographic data, BibTeX
12 Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
15 (c)2002-06-23 Vidar Bronken Gundersen
16 http://bibtexml.sf.net/
17 Reuse approved as long as this notification is kept.
20 Contributions/thanks to:
21 Egon Willighagen, http://sf.net/projects/jreferences/
22 Richard Mahoney (for providing a test case)
24 Edited by Sara Sprenkle to be more robust and handle more bibtex features.
27 1. Changed bibtex: tags to bibxml: tags.
28 2. Use xmlns:bibxml="http://bibtexml.sf.net/"
29 3. Allow spaces between @type and first {
30 4. "author" fields with multiple authors split by " and "
31 are put in separate xml "bibxml:author" tags.
32 5. Option for Titles: words are capitalized
33 only if first letter in title or capitalized inside braces
34 6. Removes braces from within field values
35 7. Ignores comments in bibtex file (including @comment{ or % )
36 8. Replaces some special latex tags, e.g., replaces ~ with ' '
37 9. Handles bibtex @string abbreviations
38 --> includes bibtex's default abbreviations for months
39 --> does concatenation of abbr # " more " and " more " # abbr
40 10. Handles @type( ... ) or @type{ ... }
41 11. The keywords field is split on , or ; and put into separate xml
42 "bibxml:keywords" tags
46 1. Does not transform LaTeX encoding like math mode and special characters.
48 2. Does not parse author fields into first and last names.
49 E.g., It does not do anything special to an author whose name is
50 in the form LAST_NAME, FIRST_NAME
51 In "author" tag, will show up as
52 <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
53 3. Does not handle "crossref" fields other than to print
54 <bibxml:crossref>...</bibxml:crossref>
55 4. Does not inform user of the input's format errors. You just won't
56 be able to transform the file later with XSL
58 You will have to manually edit the XML output if you need to handle
59 these (and unknown) limitations.
# Set of valid name characters (a regex character class, spliced into other
# patterns such as the @string definition matcher).
valid_name_chars = r'[\w\-:]'

# Global regular expressions.
#
# Every pattern that contains a backslash is written as a raw string so the
# backslashes reach the regex engine untouched.  The resulting pattern text is
# identical to what the previous non-raw literals produced on Python 2, but
# the literals no longer rely on unrecognised escape sequences (and '\\\url'
# is no longer read as a truncated \uXXXX escape by newer interpreters).

# splits an author list on the word "and" surrounded by whitespace
author_rex = re.compile(r'\s+and\s+')
# matches a single opening or closing brace
rembraces_rex = re.compile('[{}]')
# captures a brace-delimited group, braces included
capitalize_rex = re.compile('({[^}]*})')

# used by bibtexkeywords(data)
keywords_rex = re.compile('[,;]')

# used by concat_line(line)
concatsplit_rex = re.compile(r'\s*#\s*')

# split on {, }, or " in verify_out_of_braces
# (re.I is kept for parity with the original; the pattern has no letters)
delimiter_rex = re.compile('([{}"])', re.I)

# "name = rest-of-line" and "name = value," matchers
field_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
data_rex = re.compile(r'\s*(\w*)\s*=\s*([^,]*),?')

# matches \url{...} and captures the address between the braces
url_rex = re.compile(r'\\url\{([^}]*)\}')

# styles for html formatting
divstyle = 'margin-top: -4ex; margin-left: 8em;'
95 # return the string parameter with each \url{...} turned into an HTML link
def transformurls(str):
    """Return ``str`` with every ``\\url{address}`` rewritten as an HTML anchor.

    The address captured by the module-level ``url_rex`` is used both as the
    ``href`` target and as the visible link text.
    """
    anchor_template = r'<a href="\1">\1</a>'
    return url_rex.sub(anchor_template, str)
101 # return the string parameter without braces
def removebraces(str):
    """Return ``str`` with every ``{`` and ``}`` character deleted.

    Uses the module-level ``rembraces_rex`` pattern; all other characters are
    left untouched.
    """
    without_braces = rembraces_rex.sub('', str)
    return without_braces
107 # latex-specific replacements
108 # (do this after braces were removed)
# latexreplacements(line): run a fixed series of literal string.replace
# substitutions over `line`, one after another, rebinding `line` each time.
# The substitutions cover '~' and the LaTeX accent commands \'x, \"x and \H x
# for the letters listed below, in lower and upper case.
# NOTE(review): no return statement is visible in this excerpt.  Callers use
# the result as a value (e.g. `line = latexreplacements(line)`), so a
# `return line` presumably follows the last replacement -- confirm against
# the complete file.
110 def latexreplacements(line):
# '~' becomes an ordinary space.
111 line = string.replace(line, '~', ' ')
# Lower-case letters: acute (\'x) and umlaut (\"x) forms of a, e, i, o, u.
112 line = string.replace(line, '\\\'a', 'á')
113 line = string.replace(line, '\\"a', 'ä')
114 line = string.replace(line, '\\\'e', 'é')
115 line = string.replace(line, '\\"e', 'ë')
116 line = string.replace(line, '\\\'i', 'í')
117 line = string.replace(line, '\\"i', 'ï')
118 line = string.replace(line, '\\\'o', 'ó')
119 line = string.replace(line, '\\"o', 'ö')
120 line = string.replace(line, '\\\'u', 'ú')
121 line = string.replace(line, '\\"u', 'ü')
# "\H o" / "\H u": the search text includes the space after \H.  The inline
# comment on the second line records that a substitute character is used.
122 line = string.replace(line, '\\H o', 'õ')
123 line = string.replace(line, '\\H u', 'ü') # ũ does not exist
# Upper-case counterparts of the replacements above.
124 line = string.replace(line, '\\\'A', 'Á')
125 line = string.replace(line, '\\"A', 'Ä')
126 line = string.replace(line, '\\\'E', 'É')
127 line = string.replace(line, '\\"E', 'Ë')
128 line = string.replace(line, '\\\'I', 'Í')
129 line = string.replace(line, '\\"I', 'Ï')
130 line = string.replace(line, '\\\'O', 'Ó')
131 line = string.replace(line, '\\"O', 'Ö')
132 line = string.replace(line, '\\\'U', 'Ú')
133 line = string.replace(line, '\\"U', 'Ü')
134 line = string.replace(line, '\\H O', 'Õ')
135 line = string.replace(line, '\\H U', 'Ü') # Ũ does not exist
140 # copy characters from a string decoding html expressions (&xyz;)
# copychars(str, ifrom, count): called elsewhere in this file as
# copychars(author, pos, count) and copychars(author, start, len(author)),
# with the result concatenated onto strings -- so it returns a string.
# NOTE(review): most of the body is not visible in this excerpt (the
# initialisation of i and c, the handling of '&...;' sequences, and the
# return).  Only the loop condition and a letter test survive; the notes
# below describe just those lines.
142 def copychars(str, ifrom, count):
# Keep going while index i is still inside str and counter c is below count.
147 while (i < len(str)) and (c < count):
# True when str[i] is an ASCII letter, A-Z or a-z.
156 if ((str[i] >= 'A') and (str[i] <= 'Z')) or \
157 ((str[i] >= 'a') and (str[i] <= 'z')):
168 # Handle a list of authors (separated by 'and').
169 # It gives back a dictionary with the following values:
170 # - num: the number of authors,
171 # - list: the list of the author names,
172 # - text: the bibtex text (separated by commas and/or 'and')
173 # - abbrev: abbreviation that can be used for indicate the
174 # bibliography entries
# bibtexauthor(data): split an author/editor field into individual names and
# fill the dictionary `result` with the keys 'list', 'num', 'text' and
# 'abbrev'.
# NOTE(review): the initialisation of `result`, `bibtex` and `count`, a few
# branch lines, and the return statement are not visible in this excerpt;
# the notes below cover only the visible statements.
176 def bibtexauthor(data):
# Names are separated by the word "and" surrounded by whitespace (author_rex).
179 result['list'] = author_rex.split(data)
180 result['num'] = len(result['list'])
181 for i, author in enumerate(result['list']):
182 # general transformations
183 author = latexreplacements(removebraces(author.strip()))
184 # transform "Xyz, A. B." to "A. B. Xyz"
185 pos = author.find(',')
# Swap the parts around the first comma: text after it, a space, text before.
# NOTE(review): a guard on `pos` (comma actually found) is presumably on the
# line missing just above -- confirm.
187 author = author[pos+1:].strip() + ' ' + author[:pos].strip()
188 result['list'][i] = author
# '#' is a temporary separator between names; it is rewritten below.
189 bibtex += author + '#'
191 if result['num'] > 1:
# Replace the last '#' separator: ' and ' for exactly two authors,
# ', and ' in the other visible branch.
192 ix = bibtex.rfind('#')
193 if result['num'] == 2:
194 bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
196 bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
# Any remaining '#' separators become ', '.
197 bibtex = bibtex.replace('#', ', ')
198 result['text'] = bibtex
# Build the abbreviation by appending, for each author, characters copied
# from the part of the name after its last space.
200 result['abbrev'] = ''
201 for author in result['list']:
# rfind returns -1 when there is no space, so pos is then 0 (whole name).
202 pos = author.rfind(' ') + 1
# NOTE(review): `count` is set on lines not visible here; its value appears
# to depend on whether there is exactly one author -- confirm.
204 if result['num'] == 1:
206 result['abbrev'] += copychars(author, pos, count)
212 # data = title string
213 # @return the capitalized title (first letter is capitalized), rest are capitalized
214 # only if capitalized inside braces
# capitalizetitle(data): rebuild a title from pieces produced by
# capitalize_rex.split, which separates brace-delimited groups ({...}) from
# the text between them.
# NOTE(review): the initialisation of `title`, the conditions choosing
# between the capitalize() and lower() branches, and the return statement
# are not visible in this excerpt.
216 def capitalizetitle(data):
# Because the pattern has a capturing group, the brace groups themselves are
# kept as separate items in the list.
217 title_list = capitalize_rex.split(data)
220 for phrase in title_list:
# `check` is the phrase without leading whitespace; used for the tests below.
221 check = string.lstrip(phrase)
223 # keep phrase's capitalization the same
224 if check.find('{') == 0:
# Phrase starts with '{': append it unchanged apart from dropping the braces.
225 title += removebraces(phrase)
227 # first word --> capitalize first letter (after spaces)
# Note that the stripped `check` (not `phrase`) is what gets appended here.
229 title += check.capitalize()
# Remaining visible branch: append the phrase in lower case.
231 title += phrase.lower()
238 # @return the bibtex for the title
239 # @param data --> title string
240 # braces are removed from title
# bibtextitle(data, entrytype): normalise a title field.
#   data      -- the raw title string
#   entrytype -- the entry type (compared against 'book' and 'inbook')
# NOTE(review): the `else:` line and the return statement are not visible in
# this excerpt; the caller assigns the result (`line = bibtextitle(...)`),
# so the computed `title` is presumably returned -- confirm.
242 def bibtextitle(data, entrytype):
# Book-like entries keep their capitalisation; only braces are removed.
243 if entrytype in ('book', 'inbook'):
244 title = removebraces(data.strip())
# Other visible branch: run capitalizetitle first, then remove braces.
246 title = removebraces(capitalizetitle(data.strip()))
252 # function to compare entry lists
# NOTE(review): the `def` line of this comparator is not visible in this
# excerpt; the name entry_cmp is taken from its use in
# `filecont.sort(entry_cmp)`.  Each entry list has its sort key at index 0
# (see `entry.insert(0, sortkey)`), so entries are ordered by sort key.
# cmp() is the Python 2 built-in three-way comparison.
255 return cmp(x[0], y[0])
259 # build the Doxygen output lines for the transformed "filecont_source"
# bibtexdecoder(filecont_source): walk the washed BibTeX lines one at a time
# and build the Doxygen page body.
#   * A line matching pubtype_rex ("@type{ id,") starts an entry; the
#     lower-cased entry type and the entry id are remembered.
#   * A line that starts with '}' followed only by whitespace (endtype_rex)
#     closes the entry: the fields collected in `entrycont` are formatted
#     into the list `entry`, which is prefixed with sortkey, bibkey and
#     entryid and appended to `filecont`.
#   * Field lines of the form `field = {data}`, `field = "data"` or
#     `field = data` are stored in `entrycont` under the lower-cased name.
# After the loop the entries are sorted with entry_cmp, bibkeys that occur
# more than once get a letter suffix, and each entry is written to `file` as
# a '\section <id> [<key>]' line followed by a styled <div> block.
# NOTE(review): many short statements (initialisations, `else:` lines, the
# return) are not visible in this excerpt; the notes below describe only the
# visible lines.  The caller assigns the result
# (`outdata = bibtexdecoder(washeddata)`), so a value is returned.
261 def bibtexdecoder(filecont_source):
265 # want @<alphanumeric chars><spaces>{<spaces><any chars>,
266 pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')
267 endtype_rex = re.compile('}\s*$')
# NOTE(review): endtag_rex is compiled but not used in any visible line.
268 endtag_rex = re.compile('^\s*}\s*$')
# For each value style there is a pair of patterns: the *field* pattern is
# used to extract group 1 (the name), the *data* pattern group 2 (the value).
270 bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
271 bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')
273 quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
274 quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')
276 for line in filecont_source:
279 # encode character entities
# NOTE(review): as shown, each call replaces a character with itself.  The
# replacement texts were presumably HTML entities that were decoded when this
# listing was extracted -- confirm against the original file.
280 line = string.replace(line, '&', '&')
281 line = string.replace(line, '<', '<')
282 line = string.replace(line, '>', '>')
284 # start entry: publication type (store for later use)
285 if pubtype_rex.match(line):
286 # want @<alphanumeric chars><spaces>{<spaces><any chars>,
# sub() with a back-reference is used here to pull out a single group.
289 entrytype = pubtype_rex.sub('\g<1>',line)
290 entrytype = string.lower(entrytype)
291 entryid = pubtype_rex.sub('\g<2>', line)
293 # end entry if just a }
294 elif endtype_rex.match(line):
295 # generate doxygen code for the entry
297 # enty type related formattings
# Emphasise the main container (title, journal or booktitle) per entry type
# and fill in a default 'type' text for reports and theses.
298 if entrytype in ('book', 'inbook'):
299 entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
300 if not entrycont.has_key('author'):
# No author: reuse the editor record.  Both keys then refer to the same
# dictionary object, so the suffix added below shows up under 'editor' too.
301 entrycont['author'] = entrycont['editor']
302 entrycont['author']['text'] += ', editors'
303 elif entrytype == 'article':
304 entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
305 elif entrytype in ('inproceedings', 'incollection', 'conference'):
306 entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
307 elif entrytype == 'techreport':
308 if not entrycont.has_key('type'):
309 entrycont['type'] = 'Technical report'
310 elif entrytype == 'mastersthesis':
311 entrycont['type'] = 'Master\'s thesis'
312 elif entrytype == 'phdthesis':
313 entrycont['type'] = 'PhD thesis'
# NOTE(review): iterating a dict yields its keys, and rebinding `eline` does
# not write anything back into entrycont -- as far as the visible lines show,
# this loop has no lasting effect.  Confirm against the complete file.
315 for eline in entrycont:
317 eline = latexreplacements(eline)
# Page ranges: the BibTeX '--' becomes a single '-'.
319 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
320 entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')
# Assemble the output pieces in citation order.  Each piece is appended to
# `entry` together with its trailing punctuation.
322 if entrycont.has_key('author') and (entrycont['author'] != ''):
323 entry.append(entrycont['author']['text'] + '.')
324 if entrycont.has_key('title') and (entrycont['title'] != ''):
325 entry.append(entrycont['title'] + '.')
326 if entrycont.has_key('journal') and (entrycont['journal'] != ''):
327 entry.append(entrycont['journal'] + ',')
328 if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
329 entry.append('In ' + entrycont['booktitle'] + ',')
# 'type' optionally followed by the 'number' (e.g. a report number).
# NOTE(review): the line that appends `eline` to `entry` is not visible.
330 if entrycont.has_key('type') and (entrycont['type'] != ''):
331 eline = entrycont['type']
332 if entrycont.has_key('number') and (entrycont['number'] != ''):
333 eline += ' ' + entrycont['number']
336 if entrycont.has_key('institution') and (entrycont['institution'] != ''):
337 entry.append(entrycont['institution'] + ',')
338 if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
339 entry.append(entrycont['publisher'] + ',')
340 if entrycont.has_key('school') and (entrycont['school'] != ''):
341 entry.append(entrycont['school'] + ',')
342 if entrycont.has_key('address') and (entrycont['address'] != ''):
343 entry.append(entrycont['address'] + ',')
344 if entrycont.has_key('edition') and (entrycont['edition'] != ''):
345 entry.append(entrycont['edition'] + ' edition,')
346 if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
347 entry.append(entrycont['howpublished'] + ',')
# With a volume the form is "volume(number):pages"; the plain "pages ..."
# form further down is the other visible variant.
# NOTE(review): the lines joining these two variants (append / else) are not
# visible in this excerpt.
348 if entrycont.has_key('volume') and (entrycont['volume'] != ''):
349 eline = entrycont['volume'];
350 if entrycont.has_key('number') and (entrycont['number'] != ''):
351 eline += '(' + entrycont['number'] + ')'
352 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
353 eline += ':' + entrycont['pages']
357 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
358 entry.append('pages ' + entrycont['pages'] + ',')
# Date: "month year." when a month is present, otherwise "year.".
359 if entrycont.has_key('year') and (entrycont['year'] != ''):
360 if entrycont.has_key('month') and (entrycont['month'] != ''):
361 entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
363 entry.append(entrycont['year'] + '.')
364 if entrycont.has_key('note') and (entrycont['note'] != ''):
365 entry.append(entrycont['note'] + '.')
366 if entrycont.has_key('url') and (entrycont['url'] != ''):
367 entry.append(entrycont['url'] + '.')
369 # generate keys for sorting and for the output
# sortkey: characters copied from the last word of every author name, then
# the year, then the title.  bibkey: the author abbreviation plus the last
# two characters of the year.
372 if entrycont.has_key('author'):
373 for author in entrycont['author']['list']:
374 sortkey += copychars(author, author.rfind(' ')+1, len(author))
375 bibkey = entrycont['author']['abbrev']
378 if entrycont.has_key('year'):
379 sortkey += entrycont['year']
380 bibkey += entrycont['year'][-2:]
381 if entrycont.has_key('title'):
382 sortkey += entrycont['title']
# An explicit 'key' field is prefixed to the sort key and replaces the bibkey.
383 if entrycont.has_key('key'):
384 sortkey = entrycont['key'] + sortkey
385 bibkey = entrycont['key']
# Entry layout from here on: [sortkey, bibkey, entryid, text pieces...].
386 entry.insert(0, sortkey)
387 entry.insert(1, bibkey)
388 entry.insert(2, entryid)
390 # add the entry to the file contents
391 filecont.append(entry)
394 # field, publication info
398 # field = {data} entries
399 if bracedata_rex.match(line):
400 field = bracefield_rex.sub('\g<1>', line)
401 field = string.lower(field)
402 data = bracedata_rex.sub('\g<2>', line)
404 # field = "data" entries
405 elif quotedata_rex.match(line):
406 field = quotefield_rex.sub('\g<1>', line)
407 field = string.lower(field)
408 data = quotedata_rex.sub('\g<2>', line)
410 # field = data entries
411 elif data_rex.match(line):
412 field = field_rex.sub('\g<1>', line)
413 field = string.lower(field)
414 data = data_rex.sub('\g<2>', line)
# Wrap the value in \url{...} so transformurls() below turns it into a link.
# NOTE(review): the condition guarding this line is not visible; presumably
# it applies to the 'url' field only -- confirm.
417 data = '\\url{' + data.strip() + '}'
# Author/editor fields are stored as the dictionary built by bibtexauthor;
# every other field is stored as a plain string.
419 if field in ('author', 'editor'):
420 entrycont[field] = bibtexauthor(data)
422 elif field == 'title':
423 line = bibtextitle(data, entrytype)
425 line = removebraces(transformurls(data.strip()))
428 line = latexreplacements(line)
429 entrycont[field] = line
# Order the entries by their sort key (element 0 of each entry list).
433 filecont.sort(entry_cmp)
435 # count the bibtex keys
# keytable counts how many entries share each bibkey; counttable tracks how
# many suffixes have been handed out per bibkey so far.
438 for entry in filecont:
440 if not keytable.has_key(bibkey):
443 keytable[bibkey] += 1
445 for bibkey in keytable.keys():
446 counttable[bibkey] = 0
449 for entry in filecont:
450 # generate output key form the bibtex key
453 if keytable[bibkey] == 1:
# Shared bibkey: append 'a', 'b', ... (chr(97) is 'a') to make it distinct.
456 outkey = bibkey + chr(97 + counttable[bibkey])
457 counttable[bibkey] += 1
459 # append the entry code to the output
460 file.append('\\section ' + entryid + ' [' + outkey + ']')
461 file.append('<div style="' + divstyle + '">')
# entry[3:] skips sortkey, bibkey and entryid and leaves the text pieces.
462 for line in entry[3:]:
464 file.append('</div>')
471 # return 1 iff abbr is in line but not inside braces or quotes
472 # assumes that abbr appears only once on the line (out of braces and quotes)
# verify_out_of_braces(line, abbr): decide whether `abbr` occurs in `line`
# outside of braces and quotes.  The caller compares the result with 1.
# NOTE(review): the counter initialisations, the brace/quote tests and the
# return statements are not visible in this excerpt.
474 def verify_out_of_braces(line, abbr):
# Split on {, } and " while keeping those delimiters as separate items.
476 phrase_split = delimiter_rex.split(line)
# Whole-word, case-insensitive match for the abbreviation.
# NOTE(review): `abbr` is inserted into the pattern without re.escape.
478 abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)
483 for phrase in phrase_split:
# open_brace tracks the current brace nesting depth.
485 open_brace = open_brace + 1
487 open_brace = open_brace - 1
493 elif abbr_rex.search(phrase):
# The abbreviation only counts when no brace and no quote is currently open.
494 if open_brace == 0 and open_quote == 0:
501 # a line in the form phrase1 # phrase2 # ... # phrasen
502 # is returned as phrase1 phrase2 ... phrasen
503 # with the correct punctuation
504 # Bug: Doesn't always work with multiple abbreviations plugged in
# concat_line(line): rebuild a "field = a # b # ..." line, joining the pieces
# that were separated by '#' with single spaces.
# NOTE(review): the initialisation of phrase_count, several branch bodies and
# the return statement are not visible in this excerpt.  The local variable
# `concat_line` shadows the function's own name inside the body.
506 def concat_line(line):
507 # only look at part after equals
508 field = field_rex.sub('\g<1>',line)
509 rest = field_rex.sub('\g<2>',line)
511 concat_line = field + ' ='
# Pieces of the value, split on '#' with surrounding whitespace removed.
513 pound_split = concatsplit_rex.split(rest)
516 length = len(pound_split)
518 for phrase in pound_split:
519 phrase = phrase.strip()
# Every piece except the first: look at its opening delimiter.
520 if phrase_count != 0:
521 if phrase.startswith('"') or phrase.startswith('{'):
523 elif phrase.startswith('"'):
524 phrase = phrase.replace('"','{',1)
# Every piece except the last: look at its closing delimiter.
526 if phrase_count != length-1:
527 if phrase.endswith('"') or phrase.endswith('}'):
530 if phrase.endswith('"'):
532 phrase = phrase + "}"
533 elif phrase.endswith('",'):
535 phrase = phrase + "},"
537 # if phrase did have \#, add the \# back
538 if phrase.endswith('\\'):
539 phrase = phrase + "#"
540 concat_line = concat_line + ' ' + phrase
542 phrase_count = phrase_count + 1
548 # substitute abbreviations into filecont
549 # @param filecont_source - string of data from file
# bibtex_replace_abbreviations(filecont_source): take the washed file as one
# string, collect @string abbreviations (plus the twelve built-in month
# names), substitute them into the other lines, and build the new text in
# `filecont2`.
# NOTE(review): the definitions of `front`, `back`, `abbr_rex`, `filecont2`,
# the inner loop over abbreviations and the return statement are not visible
# in this excerpt; the notes below describe only the visible lines.
551 def bibtex_replace_abbreviations(filecont_source):
552 filecont = filecont_source.splitlines()
554 # These are defined in bibtex, so we'll define them too
# abbr_list and value_list are parallel lists: same index, same abbreviation.
555 abbr_list = ['jan','feb','mar','apr','may','jun',
556 'jul','aug','sep','oct','nov','dec']
557 value_list = ['January','February','March','April',
558 'May','June','July','August','September',
559 'October','November','December']
# One compiled, case-insensitive pattern per abbreviation, kept at the same
# index as the abbreviation itself.
568 abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
569 total_abbr_count = total_abbr_count + 1
# Matches "@string{ name = value"; group 1 is the name, group 2 the rest.
572 abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
575 comment_rex = re.compile('@comment\s*{',re.I)
576 preamble_rex = re.compile('@preamble\s*{',re.I)
# Flag set while inside an @string / @comment / @preamble block; it is
# cleared again on the first following line that contains '}'.
578 waiting_for_end_string = 0
582 for line in filecont:
583 if line == ' ' or line == '':
586 if waiting_for_end_string:
587 if re.search('}',line):
588 waiting_for_end_string = 0
591 if abbrdef_rex.search(line):
592 abbr = abbrdef_rex.sub('\g<1>', line)
# Only the first definition of an abbreviation is recorded.
594 if abbr_list.count(abbr) == 0:
595 val = abbrdef_rex.sub('\g<2>', line)
596 abbr_list.append(abbr)
597 value_list.append(string.strip(val))
598 abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
599 total_abbr_count = total_abbr_count + 1
600 waiting_for_end_string = 1
603 if comment_rex.search(line):
604 waiting_for_end_string = 1
607 if preamble_rex.search(line):
608 waiting_for_end_string = 1
612 # replace subsequent abbreviations with the value
617 if abbr_rex[abbr_count].search(line):
# Substitute only when the abbreviation is outside braces and quotes.
618 if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:
# '\g<1>' re-inserts whatever group 1 of the abbreviation pattern captured.
619 line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)
620 # Check for # concatenations
621 if concatsplit_rex.search(line):
622 line = concat_line(line)
623 abbr_count = abbr_count + 1
626 filecont2 = filecont2 + line + '\n'
630 # Do one final pass over file
632 # make sure that didn't end up with {" or }" after the substitution
633 filecont2 = filecont2.replace('{"','{{')
634 filecont2 = filecont2.replace('"}','}}')
636 afterquotevalue_rex = re.compile('"\s*,\s*')
637 afterbrace_rex = re.compile('"\s*}')
638 afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')
640 # add new lines to data that changed because of abbreviation substitutions
641 filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
642 filecont2 = afterbrace_rex.sub('"\n}', filecont2)
643 filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)
648 # convert @type( ... ) to @type{ ... }
# no_outer_parens(filecont): rebuild the file text piece by piece so that
# entries written as @type( ... ) use braces instead.  The caller assigns the
# result (`filecont = no_outer_parens(filecont)`).
# NOTE(review): the counter/flag initialisations, the tests on each delimiter
# and the return statement are not visible in this excerpt.
650 def no_outer_parens(filecont):
652 # do checking for open parens
653 # will convert to braces
# Split on ( ) { } while keeping each delimiter as its own list item.
654 paren_split = re.split('([(){}])',filecont)
# Detects the "@type" marker that starts an entry.
663 at_rex = re.compile('@\w*')
665 for phrase in paren_split:
# open_paren_count tracks the nesting depth of the delimiters seen so far.
669 open_paren_count = open_paren_count + 1
675 open_paren_count = open_paren_count + 1
678 open_paren_count = open_paren_count - 1
# Depth back at zero while an entry is open: this delimiter closes the entry.
679 if open_type == 1 and open_paren_count == 0:
683 elif at_rex.search( phrase ):
# The output string is accumulated by appending each piece in turn.
687 filecont = filecont + phrase
693 # make all whitespace into just one space
694 # format the bibtex file into a usable form.
# bibtexwasher(filecont_source): normalise the raw lines of a .bib file into
# the one-item-per-line layout that the later passes match line by line.
# Steps visible here: collapse whitespace and drop '%' comment lines, join
# everything into one string, convert outer parentheses to braces, insert
# newlines at the preferred places, expand abbreviations, then split back
# into lines and drop the blank ones.
# NOTE(review): several initialisations, loop-body lines and the return
# statement are not visible in this excerpt.
696 def bibtexwasher(filecont_source):
698 space_rex = re.compile('\s+')
699 comment_rex = re.compile('\s*%')
703 # remove trailing and excessive whitespace
705 for line in filecont_source:
706 line = string.strip(line)
707 line = space_rex.sub(' ', line)
# Keep the line unless it is empty or starts with '%' (a comment).
709 if not comment_rex.match(line) and line != '':
710 filecont.append(' '+ line)
712 filecont = string.join(filecont, '')
714 # the file is in one long string
716 filecont = no_outer_parens(filecont)
719 # split lines according to preferred syntax scheme
721 filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)
723 # add new lines after commas that are after values
724 filecont = re.sub('"\s*,', '",\n', filecont)
725 filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
# Put each "@type{id," header on its own line, preceded by a blank line.
726 filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',
727 '\n\n\g<1>\g<2>,\n', filecont)
729 # add new lines after }
730 filecont = re.sub('"\s*}','"\n}\n', filecont)
731 filecont = re.sub('}\s*,','},\n', filecont)
734 filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)
736 # character encoding, reserved latex characters
737 filecont = re.sub('{\\\&}', '&', filecont)
738 filecont = re.sub('\\\&', '&', filecont)
740 # do checking for open braces to get format correct
742 brace_split = re.split('([{}])',filecont)
747 for phrase in brace_split:
749 open_brace_count = open_brace_count + 1
751 open_brace_count = open_brace_count - 1
# A newline is added whenever the brace depth returns to zero.
752 if open_brace_count == 0:
753 filecont = filecont + '\n'
755 filecont = filecont + phrase
757 filecont2 = bibtex_replace_abbreviations(filecont)
760 filecont = filecont2.splitlines()
# Compact the list in place: non-blank lines are written back at index i with
# a trailing newline, j counts the blanks, and the tail is cut off afterwards.
762 j=0 # count the number of blank lines
763 for line in filecont:
765 if line == '' or line == ' ':
768 filecont[i] = line + '\n'
771 # get rid of the extra stuff at the end of the array
772 # (The extra stuff are duplicates that are in the array because
773 # blank lines were removed.)
774 length = len( filecont)
775 filecont[length-j:length] = []
# filehandler(filepath): read the BibTeX file at `filepath`, wash it with
# bibtexwasher, convert it with bibtexdecoder and print the Doxygen page
# header.
# NOTE(review): the try/except lines around the file access, the closing of
# the file and the loop that prints `outdata` are not visible in this
# excerpt.
780 def filehandler(filepath):
782 fd = open(filepath, 'r')
783 filecont_source = fd.readlines()
# Message for the failure case (Python 2 print statement).
786 print 'Could not open file:', filepath
787 washeddata = bibtexwasher(filecont_source)
788 outdata = bibtexdecoder(washeddata)
# Doxygen page header.
# NOTE(review): '\p' is not a recognised escape in a Python 2 string literal,
# so the backslash is emitted as written -- confirm if porting.
790 print '\page references References'
# Script entry point.
# NOTE(review): the `def main():` line and the lines that choose between the
# two branches below are not visible in this excerpt.
# The input file is the first command-line argument.
802 filepath = sys.argv[1]
# Message printed when no argument was supplied.
804 print "No input file"
806 filehandler(filepath)
# Run main() only when executed as a script, not when imported.
808 if __name__ == "__main__": main()