1 #!/usr/bin/env /usr/local/Python/bin/python2.1
3 BibTeX to Doxygen converter
4 Usage: python bib2dox.py bibfile.bib > bibfile.dox
6 This code is the modification of the BibTeX to XML converter
7 by Vidar Bronken Gundersen et al. See the original copyright notices below.
9 **********************************************************************
11 Decoder for bibliographic data, BibTeX
12 Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
15 (c)2002-06-23 Vidar Bronken Gundersen
16 http://bibtexml.sf.net/
17 Reuse approved as long as this notification is kept.
20 Contributions/thanks to:
21 Egon Willighagen, http://sf.net/projects/jreferences/
22 Richard Mahoney (for providing a test case)
24 Edited by Sara Sprenkle to be more robust and handle more bibtex features.
27 1. Changed bibtex: tags to bibxml: tags.
28 2. Use xmlns:bibxml="http://bibtexml.sf.net/"
29 3. Allow spaces between @type and first {
30 4. "author" fields with multiple authors split by " and "
31 are put in separate xml "bibxml:author" tags.
32 5. Option for Titles: words are capitalized
33 only if first letter in title or capitalized inside braces
34 6. Removes braces from within field values
35 7. Ignores comments in bibtex file (including @comment{ or % )
36 8. Replaces some special latex tags, e.g., replaces ~ with ' '
37 9. Handles bibtex @string abbreviations
38 --> includes bibtex's default abbreviations for months
39 --> does concatenation of abbr # " more " and " more " # abbr
40 10. Handles @type( ... ) or @type{ ... }
41 11. The keywords field is split on , or ; and put into separate xml
42 "bibxml:keywords" tags
46 1. Does not transform Latex encoding like math mode and special
48 2. Does not parse author fields into first and last names.
49 E.g., It does not do anything special to an author whose name is
50 in the form LAST_NAME, FIRST_NAME
51 In "author" tag, will show up as
52 <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
53 3. Does not handle "crossref" fields other than to print
54 <bibxml:crossref>...</bibxml:crossref>
55 4. Does not inform user of the input's format errors. You just won't
56 be able to transform the file later with XSL
58 You will have to manually edit the XML output if you need to handle
59 these (and unknown) limitations.
# Module-level patterns and style constants shared by the functions below.
# All regular expressions are written as raw strings so the backslashes reach
# the re module unchanged; the pattern values are the same as the former
# non-raw literals, but the source no longer depends on unrecognised string
# escapes (e.g. '\u' in the url pattern) being passed through.

# set of valid name characters (character class spliced into the @string
# abbreviation-definition pattern)
valid_name_chars = r'[\w\-:]'

# define global regular expression variables

# separator between authors: the word "and" surrounded by whitespace
author_rex = re.compile(r'\s+and\s+')
# any single brace character
rembraces_rex = re.compile('[{}]')
# a brace-protected word; the capture group keeps it in split() results
capitalize_rex = re.compile(r'({\w*})')

# used by bibtexkeywords(data)
keywords_rex = re.compile('[,;]')

# used by concat_line(line): the BibTeX '#' concatenation operator
concatsplit_rex = re.compile(r'\s*#\s*')

# split on {, }, or " in verify_out_of_braces
# (re.I has no effect on this pattern; kept for fidelity with the original)
delimiter_rex = re.compile('([{}"])', re.I)

# "field = rest-of-line": group 1 is the field name, group 2 everything after '='
field_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
# "field = data,": group 2 is the unquoted data up to the first comma
data_rex = re.compile(r'\s*(\w*)\s*=\s*([^,]*),?')

# LaTeX \url{target}: group 1 is the target
url_rex = re.compile(r'\\url\{([^}]*)\}')

# styles for html formatting
divstyle = 'margin-top: -4ex; margin-left: 8em;'
95 # return the string parameter with \url{...} commands converted to HTML links
def transformurls(str):
    r"""Return `str` with each LaTeX \url{target} turned into an HTML link.

    Every match of the module-level `url_rex` pattern is replaced by an
    anchor element whose href and visible text are both the captured target.
    Text outside the matches is returned unchanged.
    """
    anchor_template = r'<a href="\1">\1</a>'
    linked = url_rex.sub(anchor_template, str)
    return linked
101 # return the string parameter without braces
def removebraces(str):
    """Return `str` with every '{' and '}' character removed.

    The module-level `rembraces_rex` pattern matches a single brace; the
    pieces of text between the braces are glued back together unchanged.
    """
    pieces = rembraces_rex.split(str)
    return ''.join(pieces)
107 # latex-specific replacements
108 # (do this after braces were removed)
# (LaTeX sequence, replacement) pairs, applied in this order by
# latexreplacements().  The sequences are: '~' (non-breaking space), \'x
# (acute accent), \"x (umlaut) and \H x.
# NOTE(review): \H is LaTeX's double-acute accent; the original table maps it
# to the characters below ("does not exist" remarks kept from the original).
# NOTE(review): the replacement values are literal non-ASCII characters;
# confirm the file's source encoding is declared accordingly.
_LATEX_REPLACEMENTS = (
    ('~', ' '),
    ("\\'a", 'á'),
    ('\\"a', 'ä'),
    ("\\'e", 'é'),
    ('\\"e', 'ë'),
    ("\\'i", 'í'),
    ('\\"i', 'ï'),
    ("\\'o", 'ó'),
    ('\\"o', 'ö'),
    ("\\'u", 'ú'),
    ('\\"u', 'ü'),
    ('\\H o', 'õ'),
    ('\\H u', 'ü'),  # ũ does not exist
    ("\\'A", 'Á'),
    ('\\"A', 'Ä'),
    ("\\'E", 'É'),
    ('\\"E', 'Ë'),
    ("\\'I", 'Í'),
    ('\\"I', 'Ï'),
    ("\\'O", 'Ó'),
    ('\\"O', 'Ö'),
    ("\\'U", 'Ú'),
    ('\\"U', 'Ü'),
    ('\\H O', 'Õ'),
    ('\\H U', 'Ü'),  # Ũ does not exist
)


def latexreplacements(line):
    """Return `line` with the supported LaTeX sequences replaced.

    Applies every (sequence, replacement) pair of `_LATEX_REPLACEMENTS` in
    order using plain substring replacement.  Callers assign or pass on the
    result, so the transformed string is returned explicitly.

    line -- text whose braces have already been removed (the accent
            sequences are matched without braces)
    """
    for latex, replacement in _LATEX_REPLACEMENTS:
        # str method instead of the deprecated string.replace() function;
        # same result, and consistent with the str methods used elsewhere.
        line = line.replace(latex, replacement)
    return line
140 # copy characters from a string decoding html expressions (&xyz;)
142 def copychars(str, ifrom, count):
147 while (i < len(str)) and (c < count):
156 if ((str[i] >= 'A') and (str[i] <= 'Z')) or \
157 ((str[i] >= 'a') and (str[i] <= 'z')):
168 # Handle a list of authors (separated by 'and').
169 # It gives back a dictionary of the following values:
170 # - num: the number of authors,
171 # - list: the list of the author names,
172 # - text: the bibtex text (separated by commas and/or 'and')
173 # - abbrev: abbreviation that can be used to indicate the
174 # bibliography entries
176 def bibtexauthor(data):
179 result['list'] = author_rex.split(data)
180 result['num'] = len(result['list'])
181 for i, author in enumerate(result['list']):
182 # general transformations
183 author = latexreplacements(removebraces(author.strip()))
184 # transform "Xyz, A. B." to "A. B. Xyz"
185 pos = author.find(',')
187 author = author[pos+1:].strip() + ' ' + author[:pos].strip()
188 result['list'][i] = author
189 bibtex += author + '#'
191 if result['num'] > 1:
192 ix = bibtex.rfind('#')
193 if result['num'] == 2:
194 bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
196 bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
197 bibtex = bibtex.replace('#', ', ')
198 result['text'] = bibtex
200 result['abbrev'] = ''
201 for author in result['list']:
202 pos = author.rfind(' ') + 1
204 if result['num'] == 1:
206 result['abbrev'] += copychars(author, pos, count)
212 # data = title string
213 # @return the capitalized title (first letter is capitalized), rest are capitalized
214 # only if capitalized inside braces
216 def capitalizetitle(data):
217 title_list = capitalize_rex.split(data)
220 for phrase in title_list:
221 check = string.lstrip(phrase)
223 # keep phrase's capitalization the same
224 if check.find('{') == 0:
225 title += removebraces(phrase)
227 # first word --> capitalize first letter (after spaces)
229 title += check.capitalize()
231 title += phrase.lower()
238 # @return the bibtex for the title
239 # @param data --> title string
240 # braces are removed from title
242 def bibtextitle(data, entrytype):
243 if entrytype in ('book', 'inbook'):
244 title = removebraces(data.strip())
246 title = removebraces(capitalizetitle(data.strip()))
252 # function to compare entry lists
255 return cmp(x[0], y[0])
259 # print the XML for the transformed "filecont_source"
261 def bibtexdecoder(filecont_source):
265 # want @<alphanumeric chars><spaces>{<spaces><any chars>,
266 pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')
267 endtype_rex = re.compile('}\s*$')
268 endtag_rex = re.compile('^\s*}\s*$')
270 bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
271 bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')
273 quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
274 quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')
276 for line in filecont_source:
279 # encode character entities
280 line = string.replace(line, '&', '&')
281 line = string.replace(line, '<', '<')
282 line = string.replace(line, '>', '>')
284 # start entry: publication type (store for later use)
285 if pubtype_rex.match(line):
286 # want @<alphanumeric chars><spaces>{<spaces><any chars>,
289 entrytype = pubtype_rex.sub('\g<1>',line)
290 entrytype = string.lower(entrytype)
291 entryid = pubtype_rex.sub('\g<2>', line)
293 # end entry if just a }
294 elif endtype_rex.match(line):
295 # generate doxygen code for the entry
297 # enty type related formattings
298 if entrytype in ('book', 'inbook'):
299 entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
300 if not entrycont.has_key('author'):
301 entrycont['author'] = entrycont['editor']
302 entrycont['author']['text'] += ', editors'
303 elif entrytype == 'article':
304 entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
305 elif entrytype in ('inproceedings', 'incollection', 'conference'):
306 entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
307 elif entrytype == 'techreport':
308 if not entrycont.has_key('type'):
309 entrycont['type'] = 'Technical report'
310 elif entrytype == 'mastersthesis':
311 entrycont['type'] = 'Master\'s thesis'
312 elif entrytype == 'phdthesis':
313 entrycont['type'] = 'PhD thesis'
315 for eline in entrycont:
317 eline = latexreplacements(eline)
319 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
320 entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')
322 if entrycont.has_key('author') and (entrycont['author'] != ''):
323 entry.append(entrycont['author']['text'] + '.')
324 if entrycont.has_key('title') and (entrycont['title'] != ''):
325 entry.append(entrycont['title'] + '.')
326 if entrycont.has_key('journal') and (entrycont['journal'] != ''):
327 entry.append(entrycont['journal'] + ',')
328 if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
329 entry.append('In ' + entrycont['booktitle'] + ',')
330 if entrycont.has_key('type') and (entrycont['type'] != ''):
331 eline = entrycont['type']
332 if entrycont.has_key('number') and (entrycont['number'] != ''):
333 eline += ' ' + entrycont['number']
336 if entrycont.has_key('institution') and (entrycont['institution'] != ''):
337 entry.append(entrycont['institution'] + ',')
338 if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
339 entry.append(entrycont['publisher'] + ',')
340 if entrycont.has_key('school') and (entrycont['school'] != ''):
341 entry.append(entrycont['school'] + ',')
342 if entrycont.has_key('address') and (entrycont['address'] != ''):
343 entry.append(entrycont['address'] + ',')
344 if entrycont.has_key('edition') and (entrycont['edition'] != ''):
345 entry.append(entrycont['edition'] + ' edition,')
346 if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
347 entry.append(entrycont['howpublished'] + ',')
348 if entrycont.has_key('volume') and (entrycont['volume'] != ''):
349 eline = entrycont['volume'];
350 if entrycont.has_key('number') and (entrycont['number'] != ''):
351 eline += '(' + entrycont['number'] + ')'
352 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
353 eline += ':' + entrycont['pages']
357 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
358 entry.append('pages ' + entrycont['pages'] + ',')
359 if entrycont.has_key('year') and (entrycont['year'] != ''):
360 if entrycont.has_key('month') and (entrycont['month'] != ''):
361 entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
363 entry.append(entrycont['year'] + '.')
364 if entrycont.has_key('note') and (entrycont['note'] != ''):
365 entry.append(entrycont['note'] + '.')
367 # generate keys for sorting and for the output
370 if entrycont.has_key('author'):
371 for author in entrycont['author']['list']:
372 sortkey += copychars(author, author.rfind(' ')+1, len(author))
373 bibkey = entrycont['author']['abbrev']
376 if entrycont.has_key('year'):
377 sortkey += entrycont['year']
378 bibkey += entrycont['year'][-2:]
379 if entrycont.has_key('title'):
380 sortkey += entrycont['title']
381 if entrycont.has_key('key'):
382 sortkey = entrycont['key'] + sortkey
383 bibkey = entrycont['key']
384 entry.insert(0, sortkey)
385 entry.insert(1, bibkey)
386 entry.insert(2, entryid)
388 # add the entry to the file contents
389 filecont.append(entry)
392 # field, publication info
396 # field = {data} entries
397 if bracedata_rex.match(line):
398 field = bracefield_rex.sub('\g<1>', line)
399 field = string.lower(field)
400 data = bracedata_rex.sub('\g<2>', line)
402 # field = "data" entries
403 elif quotedata_rex.match(line):
404 field = quotefield_rex.sub('\g<1>', line)
405 field = string.lower(field)
406 data = quotedata_rex.sub('\g<2>', line)
408 # field = data entries
409 elif data_rex.match(line):
410 field = field_rex.sub('\g<1>', line)
411 field = string.lower(field)
412 data = data_rex.sub('\g<2>', line)
414 if field in ('author', 'editor'):
415 entrycont[field] = bibtexauthor(data)
417 elif field == 'title':
418 line = bibtextitle(data, entrytype)
420 line = removebraces(transformurls(data.strip()))
423 line = latexreplacements(line)
424 entrycont[field] = line
428 filecont.sort(entry_cmp)
430 # count the bibtex keys
433 for entry in filecont:
435 if not keytable.has_key(bibkey):
438 keytable[bibkey] += 1
440 for bibkey in keytable.keys():
441 counttable[bibkey] = 0
444 for entry in filecont:
445 # generate output key form the bibtex key
448 if keytable[bibkey] == 1:
451 outkey = bibkey + chr(97 + counttable[bibkey])
452 counttable[bibkey] += 1
454 # append the entry code to the output
455 file.append('\\section ' + entryid + ' [' + outkey + ']')
456 file.append('<div style="' + divstyle + '">')
457 for line in entry[3:]:
459 file.append('</div>')
466 # return 1 iff abbr is in line but not inside braces or quotes
467 # assumes that abbr appears only once on the line (out of braces and quotes)
469 def verify_out_of_braces(line, abbr):
471 phrase_split = delimiter_rex.split(line)
473 abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)
478 for phrase in phrase_split:
480 open_brace = open_brace + 1
482 open_brace = open_brace - 1
488 elif abbr_rex.search(phrase):
489 if open_brace == 0 and open_quote == 0:
496 # a line in the form phrase1 # phrase2 # ... # phrasen
497 # is returned as phrase1 phrase2 ... phrasen
498 # with the correct punctuation
499 # Bug: Doesn't always work with multiple abbreviations plugged in
501 def concat_line(line):
502 # only look at part after equals
503 field = field_rex.sub('\g<1>',line)
504 rest = field_rex.sub('\g<2>',line)
506 concat_line = field + ' ='
508 pound_split = concatsplit_rex.split(rest)
511 length = len(pound_split)
513 for phrase in pound_split:
514 phrase = phrase.strip()
515 if phrase_count != 0:
516 if phrase.startswith('"') or phrase.startswith('{'):
518 elif phrase.startswith('"'):
519 phrase = phrase.replace('"','{',1)
521 if phrase_count != length-1:
522 if phrase.endswith('"') or phrase.endswith('}'):
525 if phrase.endswith('"'):
527 phrase = phrase + "}"
528 elif phrase.endswith('",'):
530 phrase = phrase + "},"
532 # if phrase did have \#, add the \# back
533 if phrase.endswith('\\'):
534 phrase = phrase + "#"
535 concat_line = concat_line + ' ' + phrase
537 phrase_count = phrase_count + 1
543 # substitute abbreviations into filecont
544 # @param filecont_source - string of data from file
546 def bibtex_replace_abbreviations(filecont_source):
547 filecont = filecont_source.splitlines()
549 # These are defined in bibtex, so we'll define them too
550 abbr_list = ['jan','feb','mar','apr','may','jun',
551 'jul','aug','sep','oct','nov','dec']
552 value_list = ['January','February','March','April',
553 'May','June','July','August','September',
554 'October','November','December']
563 abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
564 total_abbr_count = total_abbr_count + 1
567 abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
570 comment_rex = re.compile('@comment\s*{',re.I)
571 preamble_rex = re.compile('@preamble\s*{',re.I)
573 waiting_for_end_string = 0
577 for line in filecont:
578 if line == ' ' or line == '':
581 if waiting_for_end_string:
582 if re.search('}',line):
583 waiting_for_end_string = 0
586 if abbrdef_rex.search(line):
587 abbr = abbrdef_rex.sub('\g<1>', line)
589 if abbr_list.count(abbr) == 0:
590 val = abbrdef_rex.sub('\g<2>', line)
591 abbr_list.append(abbr)
592 value_list.append(string.strip(val))
593 abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
594 total_abbr_count = total_abbr_count + 1
595 waiting_for_end_string = 1
598 if comment_rex.search(line):
599 waiting_for_end_string = 1
602 if preamble_rex.search(line):
603 waiting_for_end_string = 1
607 # replace subsequent abbreviations with the value
612 if abbr_rex[abbr_count].search(line):
613 if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:
614 line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)
615 # Check for # concatenations
616 if concatsplit_rex.search(line):
617 line = concat_line(line)
618 abbr_count = abbr_count + 1
621 filecont2 = filecont2 + line + '\n'
625 # Do one final pass over file
627 # make sure that didn't end up with {" or }" after the substitution
628 filecont2 = filecont2.replace('{"','{{')
629 filecont2 = filecont2.replace('"}','}}')
631 afterquotevalue_rex = re.compile('"\s*,\s*')
632 afterbrace_rex = re.compile('"\s*}')
633 afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')
635 # add new lines to data that changed because of abbreviation substitutions
636 filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
637 filecont2 = afterbrace_rex.sub('"\n}', filecont2)
638 filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)
643 # convert @type( ... ) to @type{ ... }
645 def no_outer_parens(filecont):
647 # do checking for open parens
648 # will convert to braces
649 paren_split = re.split('([(){}])',filecont)
658 at_rex = re.compile('@\w*')
660 for phrase in paren_split:
664 open_paren_count = open_paren_count + 1
670 open_paren_count = open_paren_count + 1
673 open_paren_count = open_paren_count - 1
674 if open_type == 1 and open_paren_count == 0:
678 elif at_rex.search( phrase ):
682 filecont = filecont + phrase
688 # make all whitespace into just one space
689 # format the bibtex file into a usable form.
691 def bibtexwasher(filecont_source):
693 space_rex = re.compile('\s+')
694 comment_rex = re.compile('\s*%')
698 # remove trailing and excessive whitespace
700 for line in filecont_source:
701 line = string.strip(line)
702 line = space_rex.sub(' ', line)
704 if not comment_rex.match(line) and line != '':
705 filecont.append(' '+ line)
707 filecont = string.join(filecont, '')
709 # the file is in one long string
711 filecont = no_outer_parens(filecont)
714 # split lines according to preferred syntax scheme
716 filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)
718 # add new lines after commas that are after values
719 filecont = re.sub('"\s*,', '",\n', filecont)
720 filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
721 filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',
722 '\n\n\g<1>\g<2>,\n', filecont)
724 # add new lines after }
725 filecont = re.sub('"\s*}','"\n}\n', filecont)
726 filecont = re.sub('}\s*,','},\n', filecont)
729 filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)
731 # character encoding, reserved latex characters
732 filecont = re.sub('{\\\&}', '&', filecont)
733 filecont = re.sub('\\\&', '&', filecont)
735 # do checking for open braces to get format correct
737 brace_split = re.split('([{}])',filecont)
742 for phrase in brace_split:
744 open_brace_count = open_brace_count + 1
746 open_brace_count = open_brace_count - 1
747 if open_brace_count == 0:
748 filecont = filecont + '\n'
750 filecont = filecont + phrase
752 filecont2 = bibtex_replace_abbreviations(filecont)
755 filecont = filecont2.splitlines()
757 j=0 # count the number of blank lines
758 for line in filecont:
760 if line == '' or line == ' ':
763 filecont[i] = line + '\n'
766 # get rid of the extra stuff at the end of the array
767 # (The extra stuff are duplicates that are in the array because
768 # blank lines were removed.)
769 length = len( filecont)
770 filecont[length-j:length] = []
775 def filehandler(filepath):
777 fd = open(filepath, 'r')
778 filecont_source = fd.readlines()
781 print 'Could not open file:', filepath
782 washeddata = bibtexwasher(filecont_source)
783 outdata = bibtexdecoder(washeddata)
785 print '\page references References'
797 filepath = sys.argv[1]
799 print "No input file"
801 filehandler(filepath)
803 if __name__ == "__main__": main()