#! /usr/bin/env python
"""
  BibTeX to Doxygen converter
  Usage: python bib2dox.py bibfile.bib > bibfile.dox

  This file is a part of LEMON, a generic C++ optimization library.

  **********************************************************************

  This code is the modification of the BibTeX to XML converter
  by Vidar Bronken Gundersen et al.
  See the original copyright notices below.

  **********************************************************************

  Decoder for bibliographic data, BibTeX
  Usage: python bibtex2xml.py bibfile.bib > bibfile.xml

  v.8
  (c)2002-06-23 Vidar Bronken Gundersen
  http://bibtexml.sf.net/
  Reuse approved as long as this notification is kept.
  Licence: GPL.

  Contributions/thanks to:
  Egon Willighagen, http://sf.net/projects/jreferences/
  Richard Mahoney (for providing a test case)

  Edited by Sara Sprenkle to be more robust and handle more bibtex features.
  (c) 2003-01-15

  1.  Changed bibtex: tags to bibxml: tags.
  2.  Use xmlns:bibxml="http://bibtexml.sf.net/"
  3.  Allow spaces between @type and first {
  4.  "author" fields with multiple authors split by " and "
      are put in separate xml "bibxml:author" tags.
  5.  Option for Titles: words are capitalized
      only if first letter in title or capitalized inside braces
  6.  Removes braces from within field values
  7.  Ignores comments in bibtex file (including @comment{ or % )
  8.  Replaces some special latex tags, e.g., replaces ~ with ' '
  9.  Handles bibtex @string abbreviations
        --> includes bibtex's default abbreviations for months
        --> does concatenation of abbr # " more " and " more " # abbr
  10. Handles @type( ... ) or @type{ ... }
  11. The keywords field is split on , or ; and put into separate xml
      "bibxml:keywords" tags
  12. Ignores @preamble

  Known Limitations
  1.  Does not transform Latex encoding like math mode and special
      latex symbols.
  2.  Does not parse author fields into first and last names.
      E.g., It does not do anything special to an author whose name is
      in the form LAST_NAME, FIRST_NAME
      In "author" tag, will show up as
      LAST_NAME, FIRST_NAME
  3.  Does not handle "crossref" fields other than to print
      ...
  4.  Does not inform user of the input's format errors.  You just won't
      be able to transform the file later with XSL

  You will have to manually edit the XML output if you need to handle
  these (and unknown) limitations.

"""

import re
import string

# set of valid name characters
valid_name_chars = r'[\w\-:]'

#
# define global regular expression variables
# (raw strings: the runtime patterns are unchanged, but the literals no
# longer rely on invalid escape sequences)
#
author_rex = re.compile(r'\s+and\s+')
rembraces_rex = re.compile(r'[{}]')
capitalize_rex = re.compile(r'({[^}]*})')

# used by bibtexkeywords(data)
# (no bibtexkeywords function is visible in this file; kept for compatibility)
keywords_rex = re.compile(r'[,;]')

# used by concat_line(line)
concatsplit_rex = re.compile(r'\s*#\s*')

# split on {, }, or " in verify_out_of_braces
delimiter_rex = re.compile(r'([{}"])', re.I)

field_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
data_rex = re.compile(r'\s*(\w*)\s*=\s*([^,]*),?')

# matches a LaTeX \url{...} command; group 1 is the argument
url_rex = re.compile(r'\\url\{([^}]*)\}')

#
# styles for html formatting
#
divstyle = 'margin-top: -4ex; margin-left: 8em;'

#
# return the string parameter with every \url{...} command replaced
# by its bare argument
#
def transformurls(str):
    return url_rex.sub(r'\1', str)

#
# return the string parameter without braces
#
def removebraces(str):
    return rembraces_rex.sub('', str)

#
# (LaTeX sequence, replacement) pairs applied in order by latexreplacements.
# The accented letters are written as ASCII HTML entities so that the file
# needs no source encoding declaration and so that copychars() can decode
# them when it builds keys.
# NOTE(review): the reviewed copy showed raw accented characters here,
# apparently because the entities had been rendered; confirm the entity
# names against the repository version.
#
_latex_replacements = (
    ('~', ' '),
    ("\\'a", '&aacute;'),
    ('\\"a', '&auml;'),
    ("\\'e", '&eacute;'),
    ('\\"e', '&euml;'),
    ("\\'i", '&iacute;'),
    ('\\"i', '&iuml;'),
    ("\\'o", '&oacute;'),
    ('\\"o', '&ouml;'),
    ("\\'u", '&uacute;'),
    ('\\"u', '&uuml;'),
    ('\\H o', '&otilde;'),
    ('\\H u', '&uuml;'),    # &utilde; does not exist
    ("\\'A", '&Aacute;'),
    ('\\"A', '&Auml;'),
    ("\\'E", '&Eacute;'),
    ('\\"E', '&Euml;'),
    ("\\'I", '&Iacute;'),
    ('\\"I', '&Iuml;'),
    ("\\'O", '&Oacute;'),
    ('\\"O', '&Ouml;'),
    ("\\'U", '&Uacute;'),
    ('\\"U', '&Uuml;'),
    ('\\H O', '&Otilde;'),
    ('\\H U', '&Uuml;'),    # &Utilde; does not exist
)

#
# latex-specific replacements
# (do this after braces were removed)
# @param line --> text possibly containing ~ and LaTeX accent commands
# @return the text with each known sequence replaced
#
def latexreplacements(line):
    for latex, replacement in _latex_replacements:
        line = line.replace(latex, replacement)
    return line

#
# copy characters from a string decoding html expressions (&xyz;)
# @param str    --> source text
# @param ifrom  --> index of the first character to look at
# @param count  --> maximum number of characters to copy
# @return at most count characters: plain ASCII letters are copied as they
#         are, an entity such as &aacute; contributes only the character
#         following the '&', and everything else is skipped
#
def copychars(str, ifrom, count):
    result = ''
    i = ifrom
    c = 0
    html_spec = False       # True while inside an &...; expression
    while (i < len(str)) and (c < count):
        ch = str[i]
        if ch == '&':
            html_spec = True
            if i + 1 < len(str):
                result += str[i + 1]
            # the entity counts as one copied character even when the '&'
            # is the last character of the string
            c += 1
            i += 2
        else:
            if not html_spec:
                if ('A' <= ch <= 'Z') or ('a' <= ch <= 'z'):
                    result += ch
                    c += 1
            elif ch == ';':
                html_spec = False
            i += 1

    return result


#
# Handle a list of authors (separated by 'and').
# It gives back a dictionary with the following keys:
#  - num: the number of authors,
#  - list: the list of the author names,
#  - text: the bibtex text (separated by commas and/or 'and')
#  - abbrev: abbreviation that can be used for indicate the
#    bibliography entries
#
def bibtexauthor(data):
    authors = []
    for author in author_rex.split(data):
        # general transformations
        author = latexreplacements(removebraces(author.strip()))
        # transform "Xyz, A. B." to "A. B. Xyz"
        pos = author.find(',')
        if pos != -1:
            author = author[pos+1:].strip() + ' ' + author[:pos].strip()
        authors.append(author)

    # the split always yields at least one (possibly empty) name
    num = len(authors)
    if num == 1:
        text = authors[0]
    elif num == 2:
        text = authors[0] + ' and ' + authors[1]
    else:
        text = ', '.join(authors[:-1]) + ', and ' + authors[-1]

    # a single author contributes three letters of the last word of the
    # name, several authors contribute one letter each
    count = 1
    if num == 1:
        count = 3
    abbrev = ''
    for author in authors:
        abbrev += copychars(author, author.rfind(' ') + 1, count)

    return {'list': authors, 'num': num, 'text': text, 'abbrev': abbrev}


#
# data = title string
# @return the capitalized title (first letter is capitalized), rest are
# capitalized only if capitalized inside braces
#
def capitalizetitle(data):
    pieces = []
    for index, phrase in enumerate(capitalize_rex.split(data)):
        check = phrase.lstrip()
        if check.startswith('{'):
            # keep phrase's capitalization the same
            pieces.append(removebraces(phrase))
        elif index == 0:
            # first word --> capitalize first letter (after spaces)
            pieces.append(check.capitalize())
        else:
            pieces.append(phrase.lower())
    return ''.join(pieces)


#
# @return the bibtex for the title
# @param data --> title string
# @param entrytype --> lower-case entry type; 'book' and 'inbook' titles
#                      keep their capitalization
# braces are removed from title
#
def bibtextitle(data, entrytype):
    if entrytype in ('book', 'inbook'):
        return removebraces(data.strip())
    return removebraces(capitalizetitle(data.strip()))


#
# function to compare entry lists by their first element (the sort key)
# @return -1, 0 or 1 (written without the cmp() builtin, which is not
# available on Python 3)
#
def entry_cmp(x, y):
    return (x[0] > y[0]) - (x[0] < y[0])


# entry header: @<type> { <id> ,
_pubtype_rex = re.compile(r'@(\w*)\s*{\s*(.*),')
# a line starting with the closing } of an entry
_endtype_rex = re.compile(r'}\s*$')
# field = {data}  and  field = "data"
_bracedata_rex = re.compile(r'\s*(\w*)\s*=\s*{(.*)},?')
_quotedata_rex = re.compile(r'\s*(\w*)\s*=\s*"(.*)",?')


#
# split a "field = value" line
# @return (lower-case field name, raw value); ('', '') if the line is not
# a field line.  Text following the matched value is kept at the end of
# the value, as the previous sub()-based extraction did.
#
def _split_field(line):
    # field = {data}, field = "data", field = data (in this order)
    for rex in (_bracedata_rex, _quotedata_rex, data_rex):
        found = rex.match(line)
        if found:
            return found.group(1).lower(), found.group(2) + line[found.end():]
    return '', ''


#
# wrap an existing field of the entry in emphasis markup
# NOTE(review): the reviewed copy concatenated empty strings here (it
# looked HTML-stripped); the <em> markup is a reconstruction -- confirm.
#
def _emphasize(entrycont, field):
    if field in entrycont:
        entrycont[field] = '<em>' + entrycont[field] + '</em>'


#
# entry type related formattings (modifies entrycont in place)
#
def _apply_entry_type(entrytype, entrycont):
    if entrytype in ('book', 'inbook'):
        _emphasize(entrycont, 'title')
        if 'author' not in entrycont and 'editor' in entrycont:
            entrycont['author'] = entrycont['editor']
            entrycont['author']['text'] += ', editors'
    elif entrytype == 'article':
        _emphasize(entrycont, 'journal')
    elif entrytype in ('inproceedings', 'incollection', 'conference'):
        _emphasize(entrycont, 'booktitle')
    elif entrytype == 'techreport':
        if 'type' not in entrycont:
            entrycont['type'] = 'Technical report'
    elif entrytype == 'mastersthesis':
        entrycont['type'] = 'Master\'s thesis'
    elif entrytype == 'phdthesis':
        entrycont['type'] = 'PhD thesis'

    if entrycont.get('pages', '') != '':
        entrycont['pages'] = entrycont['pages'].replace('--', '-')


#
# @return the list of output text fragments of one entry
#
def _entry_text(entrycont):
    def val(field):
        return entrycont.get(field, '')

    entry = []
    if 'author' in entrycont:
        entry.append(entrycont['author']['text'] + '.')
    if val('title'):
        entry.append(val('title') + '.')
    if val('journal'):
        entry.append(val('journal') + ',')
    if val('booktitle'):
        entry.append('In ' + val('booktitle') + ',')
    if val('type'):
        eline = val('type')
        if val('number'):
            eline += ' ' + val('number')
        entry.append(eline + ',')
    for field in ('institution', 'publisher', 'school', 'address'):
        if val(field):
            entry.append(val(field) + ',')
    if val('edition'):
        entry.append(val('edition') + ' edition,')
    if val('howpublished'):
        entry.append(val('howpublished') + ',')
    if val('volume'):
        eline = val('volume')
        if val('number'):
            eline += '(' + val('number') + ')'
        if val('pages'):
            eline += ':' + val('pages')
        entry.append(eline + ',')
    elif val('pages'):
        entry.append('pages ' + val('pages') + ',')
    if val('year'):
        if val('month'):
            entry.append(val('month') + ' ' + val('year') + '.')
        else:
            entry.append(val('year') + '.')
    if val('note'):
        entry.append(val('note') + '.')
    if val('url'):
        entry.append(val('url') + '.')
    return entry


#
# generate keys for sorting and for the output
# @return (sortkey, bibkey)
#
def _entry_keys(entrycont):
    sortkey = ''
    if 'author' in entrycont:
        for author in entrycont['author']['list']:
            sortkey += copychars(author, author.rfind(' ') + 1, len(author))
        bibkey = entrycont['author']['abbrev']
    else:
        bibkey = 'x'
    if 'year' in entrycont:
        sortkey += entrycont['year']
        bibkey += entrycont['year'][-2:]
    if 'title' in entrycont:
        sortkey += entrycont['title']
    if 'key' in entrycont:
        # an explicit "key" field overrides the generated label
        sortkey = entrycont['key'] + sortkey
        bibkey = entrycont['key']
    return sortkey, bibkey


#
# @return the doxygen lines for the sorted entries; each entry is a list
# [sortkey, bibkey, entryid, text fragment, ...]
# NOTE(review): the <div> markup is a reconstruction (the reviewed copy
# had a broken string literal here and never used divstyle) -- confirm.
#
def _render_entries(entries):
    # count the bibtex keys
    keytable = {}
    for entry in entries:
        keytable[entry[1]] = keytable.get(entry[1], 0) + 1

    counttable = {}
    lines = []
    for entry in entries:
        # generate output key from the bibtex key; keys used by several
        # entries get the suffixes a, b, c, ...
        bibkey = entry[1]
        entryid = entry[2]
        if keytable[bibkey] == 1:
            outkey = bibkey
        else:
            used = counttable.get(bibkey, 0)
            outkey = bibkey + chr(97 + used)
            counttable[bibkey] = used + 1

        # append the entry code to the output
        lines.append('\\section ' + entryid + ' [' + outkey + ']')
        lines.append('<div style="' + divstyle + '">')
        lines.extend(entry[3:])
        lines.append('</div>')
        lines.append('')
    return lines


#
# @return the list of doxygen output lines for the transformed
# "filecont_source"
# @param filecont_source --> iterable of washed lines (one entry header,
#        field or closing brace per line, see bibtexwasher)
#
def bibtexdecoder(filecont_source):
    entries = []
    entrycont = None    # fields of the entry being read, None outside one
    entrytype = ''
    entryid = ''

    for line in filecont_source:
        # drop the line terminator (and only that)
        if line.endswith('\n'):
            line = line[:-1]

        # encode character entities
        # NOTE(review): the reviewed copy replaced each character with
        # itself; the entity names are a reconstruction -- confirm.
        line = line.replace('&', '&amp;')
        line = line.replace('<', '&lt;')
        line = line.replace('>', '&gt;')

        # start entry: publication type (store for later use)
        header = _pubtype_rex.match(line)
        if header:
            entrycont = {}
            entrytype = header.group(1).lower()
            entryid = header.group(2)
            continue

        # nothing to attach the line to
        if entrycont is None:
            continue

        # end entry if just a }
        if _endtype_rex.match(line):
            # generate doxygen code for the entry
            _apply_entry_type(entrytype, entrycont)
            sortkey, bibkey = _entry_keys(entrycont)
            entries.append([sortkey, bibkey, entryid] + _entry_text(entrycont))
            entrycont = None
            continue

        # field, publication info
        field, data = _split_field(line)
        if field == '':
            continue

        if field == 'url':
            data = '\\url{' + data.strip() + '}'

        if field in ('author', 'editor'):
            entrycont[field] = bibtexauthor(data)
            continue

        if field == 'title':
            value = bibtextitle(data, entrytype)
        else:
            value = removebraces(transformurls(data.strip()))

        # empty values are not stored
        if value != '':
            entrycont[field] = latexreplacements(value)

    # sort entries (stable, by sort key)
    entries.sort(key=lambda entry: entry[0])

    return _render_entries(entries)


#
# return 1 iff abbr is in line but not inside braces or quotes
# assumes that abbr appears only once on the line (out of braces and quotes)
#
def verify_out_of_braces(line, abbr):
    # abbr is matched literally, as a whole word, ignoring case
    abbr_rex = re.compile(r'\b' + re.escape(abbr) + r'\b', re.I)

    open_brace = 0
    open_quote = 0

    for phrase in delimiter_rex.split(line):
        if phrase == '{':
            open_brace += 1
        elif phrase == '}':
            open_brace -= 1
        elif phrase == '"':
            # a quote toggles the "inside quotes" state
            open_quote = 1 - open_quote
        elif abbr_rex.search(phrase):
            if open_brace == 0 and open_quote == 0:
                return 1

    return 0


#
# a line in the form phrase1 # phrase2 # ... # phrasen
# is returned as phrase1 phrase2 ...
# phrasen
# with the correct punctuation
# Bug: Doesn't always work with multiple abbreviations plugged in
#
# @param line --> a "field = value" line whose value contains # operators
# @return "field = phrase1 phrase2 ... phrasen": the delimiters between the
#         concatenated phrases are dropped and a quoted value becomes a
#         brace-delimited one
#
def concat_line(line):
    # only look at part after equals
    field = field_rex.sub('\\g<1>', line)
    rest = field_rex.sub('\\g<2>', line)

    pieces = [field + ' =']

    pound_split = concatsplit_rex.split(rest)
    last = len(pound_split) - 1

    for index, phrase in enumerate(pound_split):
        phrase = phrase.strip()
        if index != 0:
            # inner opening delimiters disappear
            if phrase.startswith(('"', '{')):
                phrase = phrase[1:]
        elif phrase.startswith('"'):
            # the outermost opening quote becomes a brace
            phrase = '{' + phrase[1:]

        if index != last:
            # inner closing delimiters disappear
            if phrase.endswith(('"', '}')):
                phrase = phrase[:-1]
        else:
            # the outermost closing quote becomes a brace
            if phrase.endswith('"'):
                phrase = phrase[:-1] + '}'
            elif phrase.endswith('",'):
                phrase = phrase[:-2] + '},'

        # if phrase did have \#, add the \# back
        if phrase.endswith('\\'):
            phrase = phrase + '#'
        pieces.append(phrase)

    return ' '.join(pieces)


#
# substitute abbreviations into filecont
# @param filecont_source - string of data from file
# @return the data as one string with the @string, @comment and @preamble
#         lines dropped and the abbreviations replaced by their values
#
def bibtex_replace_abbreviations(filecont_source):
    front = r'\b'
    back = r'(,?)\b'

    names = []          # abbreviation names defined so far
    abbreviations = []  # (name, value, compiled pattern) in definition order

    def define(abbr, value):
        names.append(abbr)
        abbreviations.append(
            (abbr, value, re.compile(front + re.escape(abbr) + back, re.I)))

    # These are defined in bibtex, so we'll define them too
    months = (('jan', 'January'), ('feb', 'February'), ('mar', 'March'),
              ('apr', 'April'), ('may', 'May'), ('jun', 'June'),
              ('jul', 'July'), ('aug', 'August'), ('sep', 'September'),
              ('oct', 'October'), ('nov', 'November'), ('dec', 'December'))
    for abbr, value in months:
        define(abbr, value)

    abbrdef_rex = re.compile(
        r'\s*@string\s*{\s*(' + valid_name_chars + r'*)\s*=(.*)', re.I)
    comment_rex = re.compile(r'@comment\s*{', re.I)
    preamble_rex = re.compile(r'@preamble\s*{', re.I)

    waiting_for_end_string = 0
    kept = []

    for line in filecont_source.splitlines():
        if line == ' ' or line == '':
            continue

        if waiting_for_end_string:
            # NOTE(review): indentation was lost in the reviewed copy; this
            # keeps the reading in which only the line holding the closing
            # brace is skipped -- confirm against the repository version.
            if '}' in line:
                waiting_for_end_string = 0
                continue

        definition = abbrdef_rex.search(line)
        if definition:
            abbr = definition.group(1)
            # the first definition wins; an empty name is ignored because
            # its pattern would match at every word boundary
            if abbr and abbr not in names:
                define(abbr, definition.group(2).strip())
            waiting_for_end_string = 1
            continue

        if comment_rex.search(line) or preamble_rex.search(line):
            waiting_for_end_string = 1
            continue

        # replace subsequent abbreviations with the value
        for abbr, value, rex in abbreviations:
            if rex.search(line):
                if verify_out_of_braces(line, abbr) == 1:
                    # a function is used so that backslashes in the value
                    # (LaTeX commands) are inserted literally and not read
                    # as regex replacement escapes
                    line = rex.sub(
                        lambda found, value=value: value + found.group(1),
                        line)
                # Check for # concatenations
                if concatsplit_rex.search(line):
                    line = concat_line(line)

        kept.append(line + '\n')

    filecont2 = ''.join(kept)

    # Do one final pass over file

    # make sure that didn't end up with {" or }" after the substitution
    filecont2 = filecont2.replace('{"', '{{')
    filecont2 = filecont2.replace('"}', '}}')

    afterquotevalue_rex = re.compile(r'"\s*,\s*')
    afterbrace_rex = re.compile(r'"\s*}')
    afterbracevalue_rex = re.compile(r'(=\s*{[^=]*)},\s*')

    # add new lines to data that changed because of abbreviation substitutions
    filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
    filecont2 = afterbrace_rex.sub('"\n}', filecont2)
    filecont2 = afterbracevalue_rex.sub('\\g<1>},\n', filecont2)

    return filecont2

#
# convert @type( ... ) to @type{ ... }
# @param filecont --> the whole bibtex data as one string
# @return the data with the outer parentheses of each @type( ... ) entry
#         replaced by braces; nested parentheses are left alone
#
def no_outer_parens(filecont):

    at_rex = re.compile(r'@\w*')

    open_paren_count = 0
    open_type = 0       # 1 while inside an entry opened with '('
    look_next = 0       # 1 right after an @type token

    # rebuild filecont
    pieces = []

    # do checking for open parens
    # will convert to braces
    for phrase in re.split(r'([(){}])', filecont):
        if look_next == 1:
            if phrase == '(':
                phrase = '{'
                open_paren_count += 1
            else:
                open_type = 0
            look_next = 0

        if phrase == '(':
            open_paren_count += 1

        elif phrase == ')':
            open_paren_count -= 1
            if open_type == 1 and open_paren_count == 0:
                # this parenthesis closes the entry
                phrase = '}'
                open_type = 0

        elif at_rex.search(phrase):
            open_type = 1
            look_next = 1

        pieces.append(phrase)

    return ''.join(pieces)


#
# make all whitespace into just one space
# format the bibtex file into a usable form.
#
# @param filecont_source --> iterable of the raw lines of the bibtex file
# @return the list of washed lines (each ending with a newline): comments
#         and blank lines removed, @type( ... ) converted to @type{ ... },
#         one entry header, field or closing brace per line, and
#         abbreviations substituted
#
def bibtexwasher(filecont_source):

    space_rex = re.compile(r'\s+')
    comment_rex = re.compile(r'\s*%')

    # remove trailing and excessive whitespace
    # ignore comments
    kept = []
    for line in filecont_source:
        line = space_rex.sub(' ', line.strip())
        if line != '' and not comment_rex.match(line):
            kept.append(' ' + line)

    # the file is in one long string
    filecont = ''.join(kept)

    filecont = no_outer_parens(filecont)

    #
    # split lines according to preferred syntax scheme
    #
    filecont = re.sub(r'(=\s*{[^=]*)},', '\\g<1>},\n', filecont)

    # add new lines after commas that are after values
    filecont = re.sub(r'"\s*,', '",\n', filecont)
    filecont = re.sub(r'=\s*([\w\d]+)\s*,', '= \\g<1>,\n', filecont)
    filecont = re.sub(r'(@\w*)\s*({(\s*)[^,\s]*)\s*,',
                      '\n\n\\g<1>\\g<2>,\n', filecont)

    # add new lines after }
    filecont = re.sub(r'"\s*}', '"\n}\n', filecont)
    filecont = re.sub(r'}\s*,', '},\n', filecont)

    filecont = re.sub(r'@(\w*)', '\n@\\g<1>', filecont)

    # character encoding, reserved latex characters
    filecont = re.sub(r'{\\&}', '&', filecont)
    filecont = re.sub(r'\\&', '&', filecont)

    # do checking for open braces to get format correct:
    # a brace that closes an entry goes on a line of its own
    open_brace_count = 0
    pieces = []
    for phrase in re.split(r'([{}])', filecont):
        if phrase == '{':
            open_brace_count += 1
        elif phrase == '}':
            open_brace_count -= 1
            if open_brace_count == 0:
                pieces.append('\n')
        pieces.append(phrase)
    filecont = ''.join(pieces)

    filecont2 = bibtex_replace_abbreviations(filecont)

    # gather the lines, ignoring blank ones
    return [line + '\n'
            for line in filecont2.splitlines()
            if line != '' and line != ' ']


#
# convert the bibtex file and write the doxygen page to standard output
# @param filepath --> path of the bibtex file
# If the file cannot be read, a message is written to standard error
# (standard output is normally redirected into the .dox file) and the
# program exits with status 1.
#
def filehandler(filepath):
    import sys
    try:
        fd = open(filepath, 'r')
    except IOError:
        sys.stderr.write('Could not open file: ' + filepath + '\n')
        sys.exit(1)
    try:
        filecont_source = fd.readlines()
    finally:
        fd.close()

    washeddata = bibtexwasher(filecont_source)
    outdata = bibtexdecoder(washeddata)

    write = sys.stdout.write
    write('/**\n')
    write('\\page references References\n')
    write('\n')
    for line in outdata:
        write(line + '\n')
    write('*/\n')


# main program

#
# convert the file named by the first command line argument; without an
# argument a message is written to standard error and the exit status is 1
#
def main():
    import sys
    if sys.argv[1:]:
        filepath = sys.argv[1]
    else:
        sys.stderr.write('No input file\n')
        sys.exit(1)
    filehandler(filepath)

if __name__ == "__main__": main()


# end python script