diff -r 7c4ba7daaf5f -r 2b6bffe0e7e8 scripts/bib2dox.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/bib2dox.py Tue Dec 20 18:15:14 2011 +0100 @@ -0,0 +1,816 @@ +#! /usr/bin/env python +""" + BibTeX to Doxygen converter + Usage: python bib2dox.py bibfile.bib > bibfile.dox + + This file is a part of LEMON, a generic C++ optimization library. + + ********************************************************************** + + This code is the modification of the BibTeX to XML converter + by Vidar Bronken Gundersen et al. + See the original copyright notices below. + + ********************************************************************** + + Decoder for bibliographic data, BibTeX + Usage: python bibtex2xml.py bibfile.bib > bibfile.xml + + v.8 + (c)2002-06-23 Vidar Bronken Gundersen + http://bibtexml.sf.net/ + Reuse approved as long as this notification is kept. + Licence: GPL. + + Contributions/thanks to: + Egon Willighagen, http://sf.net/projects/jreferences/ + Richard Mahoney (for providing a test case) + + Editted by Sara Sprenkle to be more robust and handle more bibtex features. + (c) 2003-01-15 + + 1. Changed bibtex: tags to bibxml: tags. + 2. Use xmlns:bibxml="http://bibtexml.sf.net/" + 3. Allow spaces between @type and first { + 4. "author" fields with multiple authors split by " and " + are put in separate xml "bibxml:author" tags. + 5. Option for Titles: words are capitalized + only if first letter in title or capitalized inside braces + 6. Removes braces from within field values + 7. Ignores comments in bibtex file (including @comment{ or % ) + 8. Replaces some special latex tags, e.g., replaces ~ with ' ' + 9. Handles bibtex @string abbreviations + --> includes bibtex's default abbreviations for months + --> does concatenation of abbr # " more " and " more " # abbr + 10. Handles @type( ... ) or @type{ ... } + 11. The keywords field is split on , or ; and put into separate xml + "bibxml:keywords" tags + 12. Ignores @preamble + + Known Limitations + 1. Does not transform Latex encoding like math mode and special + latex symbols. + 2. Does not parse author fields into first and last names. + E.g., It does not do anything special to an author whose name is + in the form LAST_NAME, FIRST_NAME + In "author" tag, will show up as + LAST_NAME, FIRST_NAME + 3. Does not handle "crossref" fields other than to print + ... + 4. Does not inform user of the input's format errors. You just won't + be able to transform the file later with XSL + + You will have to manually edit the XML output if you need to handle + these (and unknown) limitations. + +""" + +import string, re + +# set of valid name characters +valid_name_chars = '[\w\-:]' + +# +# define global regular expression variables +# +author_rex = re.compile('\s+and\s+') +rembraces_rex = re.compile('[{}]') +capitalize_rex = re.compile('({[^}]*})') + +# used by bibtexkeywords(data) +keywords_rex = re.compile('[,;]') + +# used by concat_line(line) +concatsplit_rex = re.compile('\s*#\s*') + +# split on {, }, or " in verify_out_of_braces +delimiter_rex = re.compile('([{}"])',re.I) + +field_rex = re.compile('\s*(\w*)\s*=\s*(.*)') +data_rex = re.compile('\s*(\w*)\s*=\s*([^,]*),?') + +url_rex = re.compile('\\\url\{([^}]*)\}') + +# +# styles for html formatting +# +divstyle = 'margin-top: -4ex; margin-left: 8em;' + +# +# return the string parameter without braces +# +def transformurls(str): + return url_rex.sub(r'\1', str) + +# +# return the string parameter without braces +# +def removebraces(str): + return rembraces_rex.sub('', str) + +# +# latex-specific replacements +# (do this after braces were removed) +# +def latexreplacements(line): + line = string.replace(line, '~', ' ') + line = string.replace(line, '\\\'a', 'á') + line = string.replace(line, '\\"a', 'ä') + line = string.replace(line, '\\\'e', 'é') + line = string.replace(line, '\\"e', 'ë') + line = string.replace(line, '\\\'i', 'í') + line = string.replace(line, '\\"i', 'ï') + line = string.replace(line, '\\\'o', 'ó') + line = string.replace(line, '\\"o', 'ö') + line = string.replace(line, '\\\'u', 'ú') + line = string.replace(line, '\\"u', 'ü') + line = string.replace(line, '\\H o', 'õ') + line = string.replace(line, '\\H u', 'ü') # ũ does not exist + line = string.replace(line, '\\\'A', 'Á') + line = string.replace(line, '\\"A', 'Ä') + line = string.replace(line, '\\\'E', 'É') + line = string.replace(line, '\\"E', 'Ë') + line = string.replace(line, '\\\'I', 'Í') + line = string.replace(line, '\\"I', 'Ï') + line = string.replace(line, '\\\'O', 'Ó') + line = string.replace(line, '\\"O', 'Ö') + line = string.replace(line, '\\\'U', 'Ú') + line = string.replace(line, '\\"U', 'Ü') + line = string.replace(line, '\\H O', 'Õ') + line = string.replace(line, '\\H U', 'Ü') # Ũ does not exist + + return line + +# +# copy characters form a string decoding html expressions (&xyz;) +# +def copychars(str, ifrom, count): + result = '' + i = ifrom + c = 0 + html_spec = False + while (i < len(str)) and (c < count): + if str[i] == '&': + html_spec = True; + if i+1 < len(str): + result += str[i+1] + c += 1 + i += 2 + else: + if not html_spec: + if ((str[i] >= 'A') and (str[i] <= 'Z')) or \ + ((str[i] >= 'a') and (str[i] <= 'z')): + result += str[i] + c += 1 + elif str[i] == ';': + html_spec = False; + i += 1 + + return result + + +# +# Handle a list of authors (separated by 'and'). +# It gives back an array of the follwing values: +# - num: the number of authors, +# - list: the list of the author names, +# - text: the bibtex text (separated by commas and/or 'and') +# - abbrev: abbreviation that can be used for indicate the +# bibliography entries +# +def bibtexauthor(data): + result = {} + bibtex = '' + result['list'] = author_rex.split(data) + result['num'] = len(result['list']) + for i, author in enumerate(result['list']): + # general transformations + author = latexreplacements(removebraces(author.strip())) + # transform "Xyz, A. B." to "A. B. Xyz" + pos = author.find(',') + if pos != -1: + author = author[pos+1:].strip() + ' ' + author[:pos].strip() + result['list'][i] = author + bibtex += author + '#' + bibtex = bibtex[:-1] + if result['num'] > 1: + ix = bibtex.rfind('#') + if result['num'] == 2: + bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:] + else: + bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:] + bibtex = bibtex.replace('#', ', ') + result['text'] = bibtex + + result['abbrev'] = '' + for author in result['list']: + pos = author.rfind(' ') + 1 + count = 1 + if result['num'] == 1: + count = 3 + result['abbrev'] += copychars(author, pos, count) + + return result + + +# +# data = title string +# @return the capitalized title (first letter is capitalized), rest are capitalized +# only if capitalized inside braces +# +def capitalizetitle(data): + title_list = capitalize_rex.split(data) + title = '' + count = 0 + for phrase in title_list: + check = string.lstrip(phrase) + + # keep phrase's capitalization the same + if check.find('{') == 0: + title += removebraces(phrase) + else: + # first word --> capitalize first letter (after spaces) + if count == 0: + title += check.capitalize() + else: + title += phrase.lower() + count = count + 1 + + return title + + +# +# @return the bibtex for the title +# @param data --> title string +# braces are removed from title +# +def bibtextitle(data, entrytype): + if entrytype in ('book', 'inbook'): + title = removebraces(data.strip()) + else: + title = removebraces(capitalizetitle(data.strip())) + bibtex = title + return bibtex + + +# +# function to compare entry lists +# +def entry_cmp(x, y): + return cmp(x[0], y[0]) + + +# +# print the XML for the transformed "filecont_source" +# +def bibtexdecoder(filecont_source): + filecont = [] + file = [] + + # want @{, + pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),') + endtype_rex = re.compile('}\s*$') + endtag_rex = re.compile('^\s*}\s*$') + + bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)') + bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?') + + quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)') + quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?') + + for line in filecont_source: + line = line[:-1] + + # encode character entities + line = string.replace(line, '&', '&') + line = string.replace(line, '<', '<') + line = string.replace(line, '>', '>') + + # start entry: publication type (store for later use) + if pubtype_rex.match(line): + # want @{, + entrycont = {} + entry = [] + entrytype = pubtype_rex.sub('\g<1>',line) + entrytype = string.lower(entrytype) + entryid = pubtype_rex.sub('\g<2>', line) + + # end entry if just a } + elif endtype_rex.match(line): + # generate doxygen code for the entry + + # enty type related formattings + if entrytype in ('book', 'inbook'): + entrycont['title'] = '' + entrycont['title'] + '' + if not entrycont.has_key('author'): + entrycont['author'] = entrycont['editor'] + entrycont['author']['text'] += ', editors' + elif entrytype == 'article': + entrycont['journal'] = '' + entrycont['journal'] + '' + elif entrytype in ('inproceedings', 'incollection', 'conference'): + entrycont['booktitle'] = '' + entrycont['booktitle'] + '' + elif entrytype == 'techreport': + if not entrycont.has_key('type'): + entrycont['type'] = 'Technical report' + elif entrytype == 'mastersthesis': + entrycont['type'] = 'Master\'s thesis' + elif entrytype == 'phdthesis': + entrycont['type'] = 'PhD thesis' + + for eline in entrycont: + if eline != '': + eline = latexreplacements(eline) + + if entrycont.has_key('pages') and (entrycont['pages'] != ''): + entrycont['pages'] = string.replace(entrycont['pages'], '--', '-') + + if entrycont.has_key('author') and (entrycont['author'] != ''): + entry.append(entrycont['author']['text'] + '.') + if entrycont.has_key('title') and (entrycont['title'] != ''): + entry.append(entrycont['title'] + '.') + if entrycont.has_key('journal') and (entrycont['journal'] != ''): + entry.append(entrycont['journal'] + ',') + if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''): + entry.append('In ' + entrycont['booktitle'] + ',') + if entrycont.has_key('type') and (entrycont['type'] != ''): + eline = entrycont['type'] + if entrycont.has_key('number') and (entrycont['number'] != ''): + eline += ' ' + entrycont['number'] + eline += ',' + entry.append(eline) + if entrycont.has_key('institution') and (entrycont['institution'] != ''): + entry.append(entrycont['institution'] + ',') + if entrycont.has_key('publisher') and (entrycont['publisher'] != ''): + entry.append(entrycont['publisher'] + ',') + if entrycont.has_key('school') and (entrycont['school'] != ''): + entry.append(entrycont['school'] + ',') + if entrycont.has_key('address') and (entrycont['address'] != ''): + entry.append(entrycont['address'] + ',') + if entrycont.has_key('edition') and (entrycont['edition'] != ''): + entry.append(entrycont['edition'] + ' edition,') + if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''): + entry.append(entrycont['howpublished'] + ',') + if entrycont.has_key('volume') and (entrycont['volume'] != ''): + eline = entrycont['volume']; + if entrycont.has_key('number') and (entrycont['number'] != ''): + eline += '(' + entrycont['number'] + ')' + if entrycont.has_key('pages') and (entrycont['pages'] != ''): + eline += ':' + entrycont['pages'] + eline += ',' + entry.append(eline) + else: + if entrycont.has_key('pages') and (entrycont['pages'] != ''): + entry.append('pages ' + entrycont['pages'] + ',') + if entrycont.has_key('year') and (entrycont['year'] != ''): + if entrycont.has_key('month') and (entrycont['month'] != ''): + entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.') + else: + entry.append(entrycont['year'] + '.') + if entrycont.has_key('note') and (entrycont['note'] != ''): + entry.append(entrycont['note'] + '.') + if entrycont.has_key('url') and (entrycont['url'] != ''): + entry.append(entrycont['url'] + '.') + + # generate keys for sorting and for the output + sortkey = '' + bibkey = '' + if entrycont.has_key('author'): + for author in entrycont['author']['list']: + sortkey += copychars(author, author.rfind(' ')+1, len(author)) + bibkey = entrycont['author']['abbrev'] + else: + bibkey = 'x' + if entrycont.has_key('year'): + sortkey += entrycont['year'] + bibkey += entrycont['year'][-2:] + if entrycont.has_key('title'): + sortkey += entrycont['title'] + if entrycont.has_key('key'): + sortkey = entrycont['key'] + sortkey + bibkey = entrycont['key'] + entry.insert(0, sortkey) + entry.insert(1, bibkey) + entry.insert(2, entryid) + + # add the entry to the file contents + filecont.append(entry) + + else: + # field, publication info + field = '' + data = '' + + # field = {data} entries + if bracedata_rex.match(line): + field = bracefield_rex.sub('\g<1>', line) + field = string.lower(field) + data = bracedata_rex.sub('\g<2>', line) + + # field = "data" entries + elif quotedata_rex.match(line): + field = quotefield_rex.sub('\g<1>', line) + field = string.lower(field) + data = quotedata_rex.sub('\g<2>', line) + + # field = data entries + elif data_rex.match(line): + field = field_rex.sub('\g<1>', line) + field = string.lower(field) + data = data_rex.sub('\g<2>', line) + + if field == 'url': + data = '\\url{' + data.strip() + '}' + + if field in ('author', 'editor'): + entrycont[field] = bibtexauthor(data) + line = '' + elif field == 'title': + line = bibtextitle(data, entrytype) + elif field != '': + line = removebraces(transformurls(data.strip())) + + if line != '': + line = latexreplacements(line) + entrycont[field] = line + + + # sort entries + filecont.sort(entry_cmp) + + # count the bibtex keys + keytable = {} + counttable = {} + for entry in filecont: + bibkey = entry[1] + if not keytable.has_key(bibkey): + keytable[bibkey] = 1 + else: + keytable[bibkey] += 1 + + for bibkey in keytable.keys(): + counttable[bibkey] = 0 + + # generate output + for entry in filecont: + # generate output key form the bibtex key + bibkey = entry[1] + entryid = entry[2] + if keytable[bibkey] == 1: + outkey = bibkey + else: + outkey = bibkey + chr(97 + counttable[bibkey]) + counttable[bibkey] += 1 + + # append the entry code to the output + file.append('\\section ' + entryid + ' [' + outkey + ']') + file.append('
') + for line in entry[3:]: + file.append(line) + file.append('
') + file.append('') + + return file + + +# +# return 1 iff abbr is in line but not inside braces or quotes +# assumes that abbr appears only once on the line (out of braces and quotes) +# +def verify_out_of_braces(line, abbr): + + phrase_split = delimiter_rex.split(line) + + abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I) + + open_brace = 0 + open_quote = 0 + + for phrase in phrase_split: + if phrase == "{": + open_brace = open_brace + 1 + elif phrase == "}": + open_brace = open_brace - 1 + elif phrase == '"': + if open_quote == 1: + open_quote = 0 + else: + open_quote = 1 + elif abbr_rex.search(phrase): + if open_brace == 0 and open_quote == 0: + return 1 + + return 0 + + +# +# a line in the form phrase1 # phrase2 # ... # phrasen +# is returned as phrase1 phrase2 ... phrasen +# with the correct punctuation +# Bug: Doesn't always work with multiple abbreviations plugged in +# +def concat_line(line): + # only look at part after equals + field = field_rex.sub('\g<1>',line) + rest = field_rex.sub('\g<2>',line) + + concat_line = field + ' =' + + pound_split = concatsplit_rex.split(rest) + + phrase_count = 0 + length = len(pound_split) + + for phrase in pound_split: + phrase = phrase.strip() + if phrase_count != 0: + if phrase.startswith('"') or phrase.startswith('{'): + phrase = phrase[1:] + elif phrase.startswith('"'): + phrase = phrase.replace('"','{',1) + + if phrase_count != length-1: + if phrase.endswith('"') or phrase.endswith('}'): + phrase = phrase[:-1] + else: + if phrase.endswith('"'): + phrase = phrase[:-1] + phrase = phrase + "}" + elif phrase.endswith('",'): + phrase = phrase[:-2] + phrase = phrase + "}," + + # if phrase did have \#, add the \# back + if phrase.endswith('\\'): + phrase = phrase + "#" + concat_line = concat_line + ' ' + phrase + + phrase_count = phrase_count + 1 + + return concat_line + + +# +# substitute abbreviations into filecont +# @param filecont_source - string of data from file +# +def bibtex_replace_abbreviations(filecont_source): + filecont = filecont_source.splitlines() + + # These are defined in bibtex, so we'll define them too + abbr_list = ['jan','feb','mar','apr','may','jun', + 'jul','aug','sep','oct','nov','dec'] + value_list = ['January','February','March','April', + 'May','June','July','August','September', + 'October','November','December'] + + abbr_rex = [] + total_abbr_count = 0 + + front = '\\b' + back = '(,?)\\b' + + for x in abbr_list: + abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) ) + total_abbr_count = total_abbr_count + 1 + + + abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)', + re.I) + + comment_rex = re.compile('@comment\s*{',re.I) + preamble_rex = re.compile('@preamble\s*{',re.I) + + waiting_for_end_string = 0 + i = 0 + filecont2 = '' + + for line in filecont: + if line == ' ' or line == '': + continue + + if waiting_for_end_string: + if re.search('}',line): + waiting_for_end_string = 0 + continue + + if abbrdef_rex.search(line): + abbr = abbrdef_rex.sub('\g<1>', line) + + if abbr_list.count(abbr) == 0: + val = abbrdef_rex.sub('\g<2>', line) + abbr_list.append(abbr) + value_list.append(string.strip(val)) + abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) ) + total_abbr_count = total_abbr_count + 1 + waiting_for_end_string = 1 + continue + + if comment_rex.search(line): + waiting_for_end_string = 1 + continue + + if preamble_rex.search(line): + waiting_for_end_string = 1 + continue + + + # replace subsequent abbreviations with the value + abbr_count = 0 + + for x in abbr_list: + + if abbr_rex[abbr_count].search(line): + if verify_out_of_braces(line,abbr_list[abbr_count]) == 1: + line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line) + # Check for # concatenations + if concatsplit_rex.search(line): + line = concat_line(line) + abbr_count = abbr_count + 1 + + + filecont2 = filecont2 + line + '\n' + i = i+1 + + + # Do one final pass over file + + # make sure that didn't end up with {" or }" after the substitution + filecont2 = filecont2.replace('{"','{{') + filecont2 = filecont2.replace('"}','}}') + + afterquotevalue_rex = re.compile('"\s*,\s*') + afterbrace_rex = re.compile('"\s*}') + afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*') + + # add new lines to data that changed because of abbreviation substitutions + filecont2 = afterquotevalue_rex.sub('",\n', filecont2) + filecont2 = afterbrace_rex.sub('"\n}', filecont2) + filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2) + + return filecont2 + +# +# convert @type( ... ) to @type{ ... } +# +def no_outer_parens(filecont): + + # do checking for open parens + # will convert to braces + paren_split = re.split('([(){}])',filecont) + + open_paren_count = 0 + open_type = 0 + look_next = 0 + + # rebuild filecont + filecont = '' + + at_rex = re.compile('@\w*') + + for phrase in paren_split: + if look_next == 1: + if phrase == '(': + phrase = '{' + open_paren_count = open_paren_count + 1 + else: + open_type = 0 + look_next = 0 + + if phrase == '(': + open_paren_count = open_paren_count + 1 + + elif phrase == ')': + open_paren_count = open_paren_count - 1 + if open_type == 1 and open_paren_count == 0: + phrase = '}' + open_type = 0 + + elif at_rex.search( phrase ): + open_type = 1 + look_next = 1 + + filecont = filecont + phrase + + return filecont + + +# +# make all whitespace into just one space +# format the bibtex file into a usable form. +# +def bibtexwasher(filecont_source): + + space_rex = re.compile('\s+') + comment_rex = re.compile('\s*%') + + filecont = [] + + # remove trailing and excessive whitespace + # ignore comments + for line in filecont_source: + line = string.strip(line) + line = space_rex.sub(' ', line) + # ignore comments + if not comment_rex.match(line) and line != '': + filecont.append(' '+ line) + + filecont = string.join(filecont, '') + + # the file is in one long string + + filecont = no_outer_parens(filecont) + + # + # split lines according to preferred syntax scheme + # + filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont) + + # add new lines after commas that are after values + filecont = re.sub('"\s*,', '",\n', filecont) + filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont) + filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,', + '\n\n\g<1>\g<2>,\n', filecont) + + # add new lines after } + filecont = re.sub('"\s*}','"\n}\n', filecont) + filecont = re.sub('}\s*,','},\n', filecont) + + + filecont = re.sub('@(\w*)', '\n@\g<1>', filecont) + + # character encoding, reserved latex characters + filecont = re.sub('{\\\&}', '&', filecont) + filecont = re.sub('\\\&', '&', filecont) + + # do checking for open braces to get format correct + open_brace_count = 0 + brace_split = re.split('([{}])',filecont) + + # rebuild filecont + filecont = '' + + for phrase in brace_split: + if phrase == '{': + open_brace_count = open_brace_count + 1 + elif phrase == '}': + open_brace_count = open_brace_count - 1 + if open_brace_count == 0: + filecont = filecont + '\n' + + filecont = filecont + phrase + + filecont2 = bibtex_replace_abbreviations(filecont) + + # gather + filecont = filecont2.splitlines() + i=0 + j=0 # count the number of blank lines + for line in filecont: + # ignore blank lines + if line == '' or line == ' ': + j = j+1 + continue + filecont[i] = line + '\n' + i = i+1 + + # get rid of the extra stuff at the end of the array + # (The extra stuff are duplicates that are in the array because + # blank lines were removed.) + length = len( filecont) + filecont[length-j:length] = [] + + return filecont + + +def filehandler(filepath): + try: + fd = open(filepath, 'r') + filecont_source = fd.readlines() + fd.close() + except: + print 'Could not open file:', filepath + washeddata = bibtexwasher(filecont_source) + outdata = bibtexdecoder(washeddata) + print '/**' + print '\page references References' + print + for line in outdata: + print line + print '*/' + + +# main program + +def main(): + import sys + if sys.argv[1:]: + filepath = sys.argv[1] + else: + print "No input file" + sys.exit() + filehandler(filepath) + +if __name__ == "__main__": main() + + +# end python script