3 BibTeX to Doxygen converter
4 Usage: python bib2dox.py bibfile.bib > bibfile.dox
6 This file is a part of LEMON, a generic C++ optimization library.
8 **********************************************************************
10 This code is the modification of the BibTeX to XML converter
11 by Vidar Bronken Gundersen et al.
12 See the original copyright notices below.
14 **********************************************************************
16 Decoder for bibliographic data, BibTeX
17 Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
20 (c)2002-06-23 Vidar Bronken Gundersen
21 http://bibtexml.sf.net/
22 Reuse approved as long as this notification is kept.
25 Contributions/thanks to:
26 Egon Willighagen, http://sf.net/projects/jreferences/
27 Richard Mahoney (for providing a test case)
29 Edited by Sara Sprenkle to be more robust and handle more bibtex features.
32 1. Changed bibtex: tags to bibxml: tags.
33 2. Use xmlns:bibxml="http://bibtexml.sf.net/"
34 3. Allow spaces between @type and first {
35 4. "author" fields with multiple authors split by " and "
36 are put in separate xml "bibxml:author" tags.
37 5. Option for Titles: words are capitalized
38 only if first letter in title or capitalized inside braces
39 6. Removes braces from within field values
40 7. Ignores comments in bibtex file (including @comment{ or % )
41 8. Replaces some special latex tags, e.g., replaces ~ with ' '
42 9. Handles bibtex @string abbreviations
43 --> includes bibtex's default abbreviations for months
44 --> does concatenation of abbr # " more " and " more " # abbr
45 10. Handles @type( ... ) or @type{ ... }
46 11. The keywords field is split on , or ; and put into separate xml
47 "bibxml:keywords" tags
51 1. Does not transform Latex encoding like math mode and special characters.
53 2. Does not parse author fields into first and last names.
54 E.g., It does not do anything special to an author whose name is
55 in the form LAST_NAME, FIRST_NAME
56 In "author" tag, will show up as
57 <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
58 3. Does not handle "crossref" fields other than to print
59 <bibxml:crossref>...</bibxml:crossref>
60 4. Does not inform user of the input's format errors. You just won't
61 be able to transform the file later with XSL
63 You will have to manually edit the XML output if you need to handle
64 these (and unknown) limitations.
# Set of valid name characters (used when matching @string abbreviation names).
valid_name_chars = r'[\w\-:]'

#
# Global regular expressions.
#
# Every pattern is written as a raw string so that regex escapes such as
# \s and \w reach the re module verbatim, instead of relying on Python
# leaving unknown string escapes untouched.
#

# separator between the names of an author/editor list
author_rex = re.compile(r'\s+and\s+')
# any single brace
rembraces_rex = re.compile(r'[{}]')
# a brace-protected group; captured so that split() keeps it in the result
capitalize_rex = re.compile(r'({[^}]*})')

# used by bibtexkeywords(data)
keywords_rex = re.compile(r'[,;]')

# used by concat_line(line)
concatsplit_rex = re.compile(r'\s*#\s*')

# split on {, }, or " in verify_out_of_braces
delimiter_rex = re.compile(r'([{}"])', re.I)

# "name = value": group 1 is the field name, group 2 the rest of the line
field_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
# "name = value,": group 2 stops at the first comma
data_rex = re.compile(r'\s*(\w*)\s*=\s*([^,]*),?')

# a LaTeX \url{...} command; group 1 is the address
url_rex = re.compile(r'\\url\{([^}]*)\}')

#
# styles for html formatting
#
divstyle = 'margin-top: -4ex; margin-left: 8em;'
#
# replace every LaTeX \url{address} command with an HTML link to the address
#
def transformurls(str):
    """Return ``str`` with each LaTeX url command turned into an HTML link.

    The address captured by the module-level ``url_rex`` is used both as the
    ``href`` target and as the visible link text.  Text without a url command
    is returned unchanged.

    The parameter keeps its original name (which shadows the built-in
    ``str`` inside this function) so that existing callers are unaffected.
    """
    return url_rex.sub(r'<a href="\1">\1</a>', str)
#
# return the string parameter without braces
#
def removebraces(str):
    """Return ``str`` with every ``{`` and ``}`` character deleted.

    Uses the module-level ``rembraces_rex``.  The parameter keeps its
    original name (which shadows the built-in ``str`` inside this function)
    so that existing callers are unaffected.
    """
    return rembraces_rex.sub('', str)
#
# latex-specific replacements
# (do this after braces were removed)
#

# (LaTeX sequence, HTML replacement) pairs, applied in this order.
# The replacements are spelled as HTML character references so that the
# source stays pure ASCII (no coding declaration is needed) and so that the
# result has the "&xyz;" form that copychars() knows how to decode.
_LATEX_REPLACEMENTS = (
    ('~', '&nbsp;'),
    ('\\\'a', '&aacute;'),
    ('\\"a', '&auml;'),
    ('\\\'e', '&eacute;'),
    ('\\"e', '&euml;'),
    ('\\\'i', '&iacute;'),
    ('\\"i', '&iuml;'),
    ('\\\'o', '&oacute;'),
    ('\\"o', '&ouml;'),
    ('\\\'u', '&uacute;'),
    ('\\"u', '&uuml;'),
    # double-acute accents are approximated by the closest available letter
    ('\\H o', '&otilde;'),
    ('\\H u', '&uuml;'),
    ('\\\'A', '&Aacute;'),
    ('\\"A', '&Auml;'),
    ('\\\'E', '&Eacute;'),
    ('\\"E', '&Euml;'),
    ('\\\'I', '&Iacute;'),
    ('\\"I', '&Iuml;'),
    ('\\\'O', '&Oacute;'),
    ('\\"O', '&Ouml;'),
    ('\\\'U', '&Uacute;'),
    ('\\"U', '&Uuml;'),
    ('\\H O', '&Otilde;'),
    ('\\H U', '&Uuml;'),
)


def latexreplacements(line):
    """Return ``line`` with a fixed set of LaTeX sequences replaced.

    ``~`` becomes a non-breaking space and the accent commands ``\\'x``,
    ``\\"x`` and ``\\H x`` (for the vowels listed in the table above) become
    the matching accented letter.  Anything else is left untouched.

    The result is returned explicitly: callers assign it back
    (``line = latexreplacements(line)``) and call string methods on it.
    ``str.replace`` is used instead of the ``string`` module function so the
    code does not depend on the Python 2-only ``string.replace``.
    """
    for latex, html in _LATEX_REPLACEMENTS:
        line = line.replace(latex, html)
    return line
145 # copy characters from a string, decoding html expressions (&xyz;)
# NOTE(review): the embedded listing numbers jump (147 -> 152 -> 161), so
# most of this function's body is not visible here.  Only the loop bound and
# the letter test below can be described with confidence.
147 def copychars(str, ifrom, count):
# Loop while index i is still inside str and counter c is below count.
# i and c are initialised on lines missing from this view -- presumably from
# ifrom and 0; TODO confirm.
152 while (i < len(str)) and (c < count):
# True when str[i] is an ASCII letter (A-Z or a-z); what happens in that
# case is on lines missing from this view.
161 if ((str[i] >= 'A') and (str[i] <= 'Z')) or \
162 ((str[i] >= 'a') and (str[i] <= 'z')):
173 # Handle a list of authors (separated by 'and').
174 # It gives back a dictionary of the following values:
175 # - num: the number of authors,
176 # - list: the list of the author names,
177 # - text: the bibtex text (separated by commas and/or 'and')
178 # - abbrev: abbreviation that can be used to indicate the
179 # bibliography entries
# Build the author record for one "author"/"editor" field value.
# Fills result['list'], result['num'], result['text'] and result['abbrev'].
# NOTE(review): the embedded listing numbers skip 182-183, 191, 195, 200,
# 204, 208, 210 and 212+, so the initialisation of `result` / `bibtex`, some
# branch lines and the return statement are not visible here.
181 def bibtexauthor(data):
# split the raw field on the module-level author_rex (whitespace-and-whitespace)
184 result['list'] = author_rex.split(data)
185 result['num'] = len(result['list'])
186 for i, author in enumerate(result['list']):
187 # general transformations
188 author = latexreplacements(removebraces(author.strip()))
189 # transform "Xyz, A. B." to "A. B. Xyz"
190 pos = author.find(',')
# the part after the comma is moved in front of the part before it;
# the guard for "no comma found" is presumably on missing line 191 -- TODO confirm
192 author = author[pos+1:].strip() + ' ' + author[:pos].strip()
193 result['list'][i] = author
# '#' is a temporary separator that is rewritten below
194 bibtex += author + '#'
# With more than one author, the last '#' found becomes ' and ' (exactly two
# authors) or ', and ' (presumably the else branch on missing line 200);
# every remaining '#' becomes ', '.
196 if result['num'] > 1:
197 ix = bibtex.rfind('#')
198 if result['num'] == 2:
199 bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
201 bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
202 bibtex = bibtex.replace('#', ', ')
203 result['text'] = bibtex
# The abbreviation is assembled from letters of each author's last word:
# pos is the index just after the last space in the name.
205 result['abbrev'] = ''
206 for author in result['list']:
207 pos = author.rfind(' ') + 1
# `count` (how many letters copychars may take) is set on missing lines
# 208/210; it appears to depend on whether there is a single author.
209 if result['num'] == 1:
211 result['abbrev'] += copychars(author, pos, count)
217 # data = title string
218 # @return the capitalized title (first letter is capitalized), rest are capitalized
219 # only if capitalized inside braces
# Re-case a title piece by piece.
# NOTE(review): the embedded listing numbers skip 223-224, 227, 231, 233,
# 235 and 237+, so the initialisation of `title`, the else/first-word
# branch lines and the return statement are not visible here.
221 def capitalizetitle(data):
# capitalize_rex has a capturing group, so the brace-protected groups are
# kept as separate items of the split result
222 title_list = capitalize_rex.split(data)
225 for phrase in title_list:
# `check` is the phrase without leading whitespace
226 check = string.lstrip(phrase)
228 # keep phrase's capitalization the same
229 if check.find('{') == 0:
230 title += removebraces(phrase)
232 # first word --> capitalize first letter (after spaces)
# which condition selects capitalize() versus lower() is on missing lines
234 title += check.capitalize()
236 title += phrase.lower()
#
# @return the bibtex for the title
# @param data --> title string
# @param entrytype --> lower-cased entry type (e.g. 'book', 'article')
# braces are removed from title
#
def bibtextitle(data, entrytype):
    """Return the cleaned-up title for an entry of type ``entrytype``.

    Titles of 'book' and 'inbook' entries keep their capitalization and only
    lose their braces.  Every other entry type is first passed through
    ``capitalizetitle`` and then has its braces removed.  Surrounding
    whitespace is stripped in both cases.
    """
    stripped = data.strip()
    if entrytype in ('book', 'inbook'):
        title = removebraces(stripped)
    else:
        title = removebraces(capitalizetitle(stripped))
    # callers assign the result (line = bibtextitle(data, entrytype))
    return title
#
# function to compare entry lists
#
def entry_cmp(x, y):
    """Three-way compare two entry lists by their first item (the sort key).

    Returns -1, 0 or 1 when ``x[0]`` is respectively smaller than, equal to
    or greater than ``y[0]``.  The comparison is spelled out arithmetically
    so that it does not depend on the Python 2-only ``cmp`` builtin.
    """
    return (x[0] > y[0]) - (x[0] < y[0])
264 # build the Doxygen lines for the transformed "filecont_source"
# Turn the washed BibTeX lines into Doxygen output lines.
# Visible flow: parse each line into the current entry (type, id, fields),
# format the entry when its closing brace is met, sort all entries, then
# emit a "\section" plus a styled <div> per entry.
# NOTE(review): many embedded listing numbers are skipped in this function
# (e.g. 267-269, 282-283, 292-293, 301, 319, 339-340, 359-361, 375-376,
# 397-402, 435-437, 441-442, 468, 470+), so initialisations, else branches
# and the return statement are not visible here.
# NOTE(review): dict.has_key() and sort(cmp_function) are Python 2-only.
266 def bibtexdecoder(filecont_source):
270 # want @<alphanumeric chars><spaces>{<spaces><any chars>,
# group 1 = entry type, group 2 = entry id
271 pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')
# used with match(): a line that starts with '}' followed only by whitespace
272 endtype_rex = re.compile('}\s*$')
# endtag_rex is not referenced on any line visible in this view
273 endtag_rex = re.compile('^\s*}\s*$')
# *field_rex patterns yield the field name (group 1); the matching
# *data_rex patterns yield the value inside {...} or "..." (group 2)
275 bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
276 bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')
278 quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
279 quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')
281 for line in filecont_source:
284 # encode character entities
# NOTE(review): as shown, each replacement string equals its search string,
# so these three calls change nothing.  Presumably the replacements were
# entity names that got decoded in this copy -- TODO confirm.
285 line = string.replace(line, '&', '&')
286 line = string.replace(line, '<', '<')
287 line = string.replace(line, '>', '>')
289 # start entry: publication type (store for later use)
290 if pubtype_rex.match(line):
291 # want @<alphanumeric chars><spaces>{<spaces><any chars>,
# sub() with \g<n> over the whole line is used to extract group n
294 entrytype = pubtype_rex.sub('\g<1>',line)
295 entrytype = string.lower(entrytype)
296 entryid = pubtype_rex.sub('\g<2>', line)
298 # end entry if just a }
299 elif endtype_rex.match(line):
300 # generate doxygen code for the entry
302 # enty type related formattings
303 if entrytype in ('book', 'inbook'):
304 entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
# a book without authors is listed under its editors; the same dict object
# is stored under both keys, so the ', editors' suffix is visible through
# entrycont['editor'] as well
305 if not entrycont.has_key('author'):
306 entrycont['author'] = entrycont['editor']
307 entrycont['author']['text'] += ', editors'
308 elif entrytype == 'article':
309 entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
310 elif entrytype in ('inproceedings', 'incollection', 'conference'):
311 entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
312 elif entrytype == 'techreport':
313 if not entrycont.has_key('type'):
314 entrycont['type'] = 'Technical report'
315 elif entrytype == 'mastersthesis':
316 entrycont['type'] = 'Master\'s thesis'
317 elif entrytype == 'phdthesis':
318 entrycont['type'] = 'PhD thesis'
# NOTE(review): iterating a dict yields its keys, and the assignment below
# only rebinds the loop variable; with line 321 missing from this view it is
# unclear what this loop is meant to change -- TODO confirm.
320 for eline in entrycont:
322 eline = latexreplacements(eline)
# page ranges: LaTeX "--" becomes a single "-"
324 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
325 entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')
# The output fragments are appended to `entry` in this fixed order; each
# field is skipped when absent or empty.
327 if entrycont.has_key('author') and (entrycont['author'] != ''):
328 entry.append(entrycont['author']['text'] + '.')
329 if entrycont.has_key('title') and (entrycont['title'] != ''):
330 entry.append(entrycont['title'] + '.')
331 if entrycont.has_key('journal') and (entrycont['journal'] != ''):
332 entry.append(entrycont['journal'] + ',')
333 if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
334 entry.append('In ' + entrycont['booktitle'] + ',')
# "<type> <number>"; the append of eline is on missing lines 339-340
335 if entrycont.has_key('type') and (entrycont['type'] != ''):
336 eline = entrycont['type']
337 if entrycont.has_key('number') and (entrycont['number'] != ''):
338 eline += ' ' + entrycont['number']
341 if entrycont.has_key('institution') and (entrycont['institution'] != ''):
342 entry.append(entrycont['institution'] + ',')
343 if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
344 entry.append(entrycont['publisher'] + ',')
345 if entrycont.has_key('school') and (entrycont['school'] != ''):
346 entry.append(entrycont['school'] + ',')
347 if entrycont.has_key('address') and (entrycont['address'] != ''):
348 entry.append(entrycont['address'] + ',')
349 if entrycont.has_key('edition') and (entrycont['edition'] != ''):
350 entry.append(entrycont['edition'] + ' edition,')
351 if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
352 entry.append(entrycont['howpublished'] + ',')
# "volume(number):pages"; lines 359-361 (presumably the append and an else
# leading to the plain "pages" form below) are missing -- TODO confirm
353 if entrycont.has_key('volume') and (entrycont['volume'] != ''):
354 eline = entrycont['volume'];
355 if entrycont.has_key('number') and (entrycont['number'] != ''):
356 eline += '(' + entrycont['number'] + ')'
357 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
358 eline += ':' + entrycont['pages']
362 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
363 entry.append('pages ' + entrycont['pages'] + ',')
364 if entrycont.has_key('year') and (entrycont['year'] != ''):
365 if entrycont.has_key('month') and (entrycont['month'] != ''):
366 entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
368 entry.append(entrycont['year'] + '.')
369 if entrycont.has_key('note') and (entrycont['note'] != ''):
370 entry.append(entrycont['note'] + '.')
371 if entrycont.has_key('url') and (entrycont['url'] != ''):
372 entry.append(entrycont['url'] + '.')
374 # generate keys for sorting and for the output
# sortkey: letters of each author's last word, then year, then title
# bibkey: author abbreviation plus the last two characters of the year
377 if entrycont.has_key('author'):
378 for author in entrycont['author']['list']:
379 sortkey += copychars(author, author.rfind(' ')+1, len(author))
380 bibkey = entrycont['author']['abbrev']
383 if entrycont.has_key('year'):
384 sortkey += entrycont['year']
385 bibkey += entrycont['year'][-2:]
386 if entrycont.has_key('title'):
387 sortkey += entrycont['title']
# an explicit "key" field replaces bibkey and is put in front of sortkey
388 if entrycont.has_key('key'):
389 sortkey = entrycont['key'] + sortkey
390 bibkey = entrycont['key']
# entry[0], entry[1], entry[2] = sortkey, bibkey, entryid; text starts at entry[3]
391 entry.insert(0, sortkey)
392 entry.insert(1, bibkey)
393 entry.insert(2, entryid)
395 # add the entry to the file contents
396 filecont.append(entry)
399 # field, publication info
403 # field = {data} entries
404 if bracedata_rex.match(line):
405 field = bracefield_rex.sub('\g<1>', line)
406 field = string.lower(field)
407 data = bracedata_rex.sub('\g<2>', line)
409 # field = "data" entries
410 elif quotedata_rex.match(line):
411 field = quotefield_rex.sub('\g<1>', line)
412 field = string.lower(field)
413 data = quotedata_rex.sub('\g<2>', line)
415 # field = data entries
416 elif data_rex.match(line):
417 field = field_rex.sub('\g<1>', line)
418 field = string.lower(field)
419 data = data_rex.sub('\g<2>', line)
# the value is wrapped in a LaTeX \url{...} command; the condition guarding
# this is on missing lines 420-421 (presumably field == 'url' -- TODO confirm)
422 data = '\\url{' + data.strip() + '}'
# author/editor values become the dict built by bibtexauthor; the other
# fields are stored as cleaned strings
424 if field in ('author', 'editor'):
425 entrycont[field] = bibtexauthor(data)
427 elif field == 'title':
428 line = bibtextitle(data, entrytype)
430 line = removebraces(transformurls(data.strip()))
433 line = latexreplacements(line)
434 entrycont[field] = line
# order the entries with entry_cmp (compares entry[0], the sort key)
438 filecont.sort(entry_cmp)
440 # count the bibtex keys
443 for entry in filecont:
445 if not keytable.has_key(bibkey):
448 keytable[bibkey] += 1
450 for bibkey in keytable.keys():
451 counttable[bibkey] = 0
454 for entry in filecont:
455 # generate output key form the bibtex key
# a bibkey used once is handled on missing lines 459-460; otherwise a
# running letter a, b, c, ... (chr(97 + n)) is appended to it
458 if keytable[bibkey] == 1:
461 outkey = bibkey + chr(97 + counttable[bibkey])
462 counttable[bibkey] += 1
464 # append the entry code to the output
465 file.append('\\section ' + entryid + ' [' + outkey + ']')
466 file.append('<div style="' + divstyle + '">')
# entry[3:] skips the sortkey, bibkey and entryid inserted above
467 for line in entry[3:]:
469 file.append('</div>')
476 # return 1 iff abbr is in line but not inside braces or quotes
477 # assumes that abbr appears only once on the line (out of braces and quotes)
# NOTE(review): the embedded listing numbers skip 480, 482, 484-487, 489,
# 491, 493-497 and 500+, so the counter initialisation, the delimiter tests
# and the return statements are not visible here.
479 def verify_out_of_braces(line, abbr):
# delimiter_rex has a capturing group, so the {, } and " delimiters are
# kept as separate items of the split result
481 phrase_split = delimiter_rex.split(line)
# whole-word, case-insensitive match; abbr is inserted into the pattern unescaped
483 abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)
488 for phrase in phrase_split:
# open_brace tracks the current brace nesting depth; the tests deciding
# between increment and decrement are on missing lines 489/491
490 open_brace = open_brace + 1
492 open_brace = open_brace - 1
# the abbreviation only counts when no brace and no quote is open
498 elif abbr_rex.search(phrase):
499 if open_brace == 0 and open_quote == 0:
506 # a line in the form phrase1 # phrase2 # ... # phrasen
507 # is returned as phrase1 phrase2 ... phrasen
508 # with the correct punctuation
509 # Bug: Doesn't always work with multiple abbreviations plugged in
# Rebuild a "field = a # b # c" line as "field = a b c".
# NOTE(review): the embedded listing numbers skip 515, 517, 519-520, 522,
# 527, 530, 533-534, 536, 539, 541, 546 and 548+, so the initialisation of
# phrase_count, several branch bodies and the return statement are not
# visible here.
511 def concat_line(line):
512 # only look at part after equals
513 field = field_rex.sub('\g<1>',line)
514 rest = field_rex.sub('\g<2>',line)
# note: this local variable has the same name as the function itself
516 concat_line = field + ' ='
# split the value on '#' (with optional surrounding whitespace)
518 pound_split = concatsplit_rex.split(rest)
521 length = len(pound_split)
523 for phrase in pound_split:
524 phrase = phrase.strip()
# every phrase except the first: handling of its opening delimiter
525 if phrase_count != 0:
526 if phrase.startswith('"') or phrase.startswith('{'):
# NOTE(review): if this elif pairs with the `if` on line 526 it can never be
# taken (the same test is already part of that condition); the original
# nesting cannot be recovered from this view -- TODO confirm.
528 elif phrase.startswith('"'):
529 phrase = phrase.replace('"','{',1)
# every phrase except the last: handling of its closing delimiter
531 if phrase_count != length-1:
532 if phrase.endswith('"') or phrase.endswith('}'):
535 if phrase.endswith('"'):
537 phrase = phrase + "}"
538 elif phrase.endswith('",'):
540 phrase = phrase + "},"
542 # if phrase did have \#, add the \# back
543 if phrase.endswith('\\'):
544 phrase = phrase + "#"
545 concat_line = concat_line + ' ' + phrase
547 phrase_count = phrase_count + 1
553 # substitute abbreviations into filecont
554 # @param filecont_source - string of data from file
# Substitute @string abbreviations (and the built-in month names) in the
# given text, working line by line, and re-insert newlines afterwards.
# NOTE(review): the embedded listing numbers skip 565-572, 575-576, 578-579,
# 582, 584-586, 589-590, 594-595, 598, 606-607, 610-611, 614-616, 618-621,
# 629-630, 632-634 and 649+, so the definitions of abbr_rex, front, back,
# total_abbr_count, abbr_count and filecont2, the loop over abbreviations
# and the return statement are not visible here.
556 def bibtex_replace_abbreviations(filecont_source):
557 filecont = filecont_source.splitlines()
559 # These are defined in bibtex, so we'll define them too
# abbr_list[i] is replaced by value_list[i]; the two lists stay in step
560 abbr_list = ['jan','feb','mar','apr','may','jun',
561 'jul','aug','sep','oct','nov','dec']
562 value_list = ['January','February','March','April',
563 'May','June','July','August','September',
564 'October','November','December']
# one case-insensitive regex per abbreviation, built as front + abbr + back
573 abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
574 total_abbr_count = total_abbr_count + 1
# @string{name = value: group 1 is the name, group 2 the rest of the line
# (this call is continued on missing line 578)
577 abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
580 comment_rex = re.compile('@comment\s*{',re.I)
581 preamble_rex = re.compile('@preamble\s*{',re.I)
# set to 1 after an @string/@comment/@preamble start is seen and cleared
# again on a line containing '}'
583 waiting_for_end_string = 0
587 for line in filecont:
588 if line == ' ' or line == '':
591 if waiting_for_end_string:
592 if re.search('}',line):
593 waiting_for_end_string = 0
596 if abbrdef_rex.search(line):
597 abbr = abbrdef_rex.sub('\g<1>', line)
# only abbreviations not seen before are registered
599 if abbr_list.count(abbr) == 0:
600 val = abbrdef_rex.sub('\g<2>', line)
601 abbr_list.append(abbr)
602 value_list.append(string.strip(val))
603 abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
604 total_abbr_count = total_abbr_count + 1
605 waiting_for_end_string = 1
608 if comment_rex.search(line):
609 waiting_for_end_string = 1
612 if preamble_rex.search(line):
613 waiting_for_end_string = 1
617 # replace subsequent abbreviations with the value
622 if abbr_rex[abbr_count].search(line):
623 if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:
# the value is used as a replacement template followed by group 1 of the
# match (presumably what `back` captured -- TODO confirm)
624 line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)
625 # Check for # concatenations
626 if concatsplit_rex.search(line):
627 line = concat_line(line)
628 abbr_count = abbr_count + 1
631 filecont2 = filecont2 + line + '\n'
635 # Do one final pass over file
637 # make sure that didn't end up with {" or }" after the substitution
638 filecont2 = filecont2.replace('{"','{{')
639 filecont2 = filecont2.replace('"}','}}')
641 afterquotevalue_rex = re.compile('"\s*,\s*')
642 afterbrace_rex = re.compile('"\s*}')
643 afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')
645 # add new lines to data that changed because of abbreviation substitutions
646 filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
647 filecont2 = afterbrace_rex.sub('"\n}', filecont2)
648 filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)
653 # convert @type( ... ) to @type{ ... }
# NOTE(review): the embedded listing numbers skip 656, 660-667, 669,
# 671-673, 675-679, 681-682, 685-687, 689-691 and 693+, so the counter
# initialisation, the per-delimiter tests and the return statement are not
# visible here.
655 def no_outer_parens(filecont):
657 # do checking for open parens
658 # will convert to braces
# the capturing group keeps each (, ), { and } as its own item
659 paren_split = re.split('([(){}])',filecont)
# matches an '@' followed by any word characters
668 at_rex = re.compile('@\w*')
670 for phrase in paren_split:
# open_paren_count is raised and lowered as delimiters are met; the tests
# selecting each branch are on missing lines
674 open_paren_count = open_paren_count + 1
680 open_paren_count = open_paren_count + 1
683 open_paren_count = open_paren_count - 1
# the count returning to 0 while open_type is 1 is handled specially
# (body on missing lines 685-687)
684 if open_type == 1 and open_paren_count == 0:
688 elif at_rex.search( phrase ):
# the text is re-assembled piece by piece into filecont
692 filecont = filecont + phrase
698 # make all whitespace into just one space
699 # format the bibtex file into a usable form.
# Normalise the raw BibTeX lines: collapse whitespace, drop % comment lines,
# join everything into one string, re-split it one item per line, run the
# abbreviation substitution and clean up blank lines.
# NOTE(review): the embedded listing numbers skip 702, 705-707, 709, 713,
# 716, 718, 720, 722-723, 725, 727, 733, 737-738, 740, 744, 746, 748-751,
# 753, 755, 759, 761, 763-764, 766, 769, 771-772, 774-775 and 781+, so
# several initialisations, branch tests and the return statement are not
# visible here.
701 def bibtexwasher(filecont_source):
703 space_rex = re.compile('\s+')
# used with match(): a line whose first non-blank character is '%'
704 comment_rex = re.compile('\s*%')
708 # remove trailing and excessive whitespace
710 for line in filecont_source:
711 line = string.strip(line)
712 line = space_rex.sub(' ', line)
714 if not comment_rex.match(line) and line != '':
715 filecont.append(' '+ line)
717 filecont = string.join(filecont, '')
719 # the file is in one long string
721 filecont = no_outer_parens(filecont)
724 # split lines according to preferred syntax scheme
726 filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)
728 # add new lines after commas that are after values
729 filecont = re.sub('"\s*,', '",\n', filecont)
730 filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
# put each "@type{id," header on its own line, preceded by a blank line
731 filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',
732 '\n\n\g<1>\g<2>,\n', filecont)
734 # add new lines after }
735 filecont = re.sub('"\s*}','"\n}\n', filecont)
736 filecont = re.sub('}\s*,','},\n', filecont)
739 filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)
741 # character encoding, reserved latex characters
# the patterns match a backslash-escaped ampersand, with and without
# surrounding braces; as shown both are replaced by a bare '&'
742 filecont = re.sub('{\\\&}', '&', filecont)
743 filecont = re.sub('\\\&', '&', filecont)
745 # do checking for open braces to get format correct
747 brace_split = re.split('([{}])',filecont)
752 for phrase in brace_split:
754 open_brace_count = open_brace_count + 1
756 open_brace_count = open_brace_count - 1
# a newline is added whenever the brace depth returns to zero
757 if open_brace_count == 0:
758 filecont = filecont + '\n'
760 filecont = filecont + phrase
762 filecont2 = bibtex_replace_abbreviations(filecont)
765 filecont = filecont2.splitlines()
767 j=0 # count the number of blank lines
768 for line in filecont:
770 if line == '' or line == ' ':
# non-blank lines are written back at index i with a trailing newline;
# i is maintained on missing lines
773 filecont[i] = line + '\n'
776 # get rid of the extra stuff at the end of the array
777 # (The extra stuff are duplicates that are in the array because
778 # blank lines were removed.)
779 length = len( filecont)
780 filecont[length-j:length] = []
# Read the BibTeX file at `filepath`, run it through bibtexwasher and
# bibtexdecoder, and print the Doxygen page.
# NOTE(review): the embedded listing numbers skip 786, 789-790, 794 and
# 796+, so the try/except structure around open(), any fd.close() call and
# the loop printing `outdata` are not visible here.
# NOTE(review): `print` is used as a statement (Python 2 syntax).
785 def filehandler(filepath):
787 fd = open(filepath, 'r')
788 filecont_source = fd.readlines()
# presumably inside the handler for a failed open() -- TODO confirm
791 print 'Could not open file:', filepath
792 washeddata = bibtexwasher(filecont_source)
793 outdata = bibtexdecoder(washeddata)
# Doxygen page header: page name "references", title "References"
795 print '\page references References'
# Script entry point: take the input path from the first command-line
# argument and hand it to filehandler.
# NOTE(review): the embedded listing numbers skip 797-806, 808, 810 and 812,
# so the `def main():` line and the check that guards the "No input file"
# message are not visible here.
807 filepath = sys.argv[1]
# presumably printed when no argument was given -- TODO confirm
809 print "No input file"
811 filehandler(filepath)
# run main() only when the file is executed as a script
813 if __name__ == "__main__": main()