scripts/bib2dox.py
changeset 1402 3c00344f49c9
parent 1401 cd72eae05bdf
parent 1400 6b79d93e812f
child 1403 e5af35e6c93f
child 1404 c8d0179a32a2
child 1416 f179aa1045a4
     1.1 --- a/scripts/bib2dox.py	Mon Jul 16 16:21:40 2018 +0200
     1.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.3 @@ -1,816 +0,0 @@
     1.4 -#! /usr/bin/env python
     1.5 -"""
     1.6 -  BibTeX to Doxygen converter
     1.7 -  Usage: python bib2dox.py bibfile.bib > bibfile.dox
     1.8 -
     1.9 -  This file is a part of LEMON, a generic C++ optimization library.
    1.10 -
    1.11 -  **********************************************************************
    1.12 -
    1.13 -  This code is the modification of the BibTeX to XML converter
    1.14 -  by Vidar Bronken Gundersen et al.
    1.15 -  See the original copyright notices below. 
    1.16 -
    1.17 -  **********************************************************************
    1.18 -
    1.19 -  Decoder for bibliographic data, BibTeX
    1.20 -  Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
    1.21 -
    1.22 -  v.8
    1.23 -  (c)2002-06-23 Vidar Bronken Gundersen
    1.24 -  http://bibtexml.sf.net/
    1.25 -  Reuse approved as long as this notification is kept.
    1.26 -  Licence: GPL.
    1.27 -
    1.28 -  Contributions/thanks to:
    1.29 -  Egon Willighagen, http://sf.net/projects/jreferences/
    1.30 -  Richard Mahoney (for providing a test case)
    1.31 -
    1.32 -  Edited by Sara Sprenkle to be more robust and handle more bibtex features.
    1.33 -  (c) 2003-01-15
    1.34 -
    1.35 -  1.  Changed bibtex: tags to bibxml: tags.
    1.36 -  2.  Use xmlns:bibxml="http://bibtexml.sf.net/"
    1.37 -  3.  Allow spaces between @type and first {
    1.38 -  4.  "author" fields with multiple authors split by " and "
    1.39 -      are put in separate xml "bibxml:author" tags.
    1.40 -  5.  Option for Titles: words are capitalized
    1.41 -      only if first letter in title or capitalized inside braces
    1.42 -  6.  Removes braces from within field values
    1.43 -  7.  Ignores comments in bibtex file (including @comment{ or % )
    1.44 -  8.  Replaces some special latex tags, e.g., replaces ~ with ' '
    1.45 -  9.  Handles bibtex @string abbreviations
    1.46 -        --> includes bibtex's default abbreviations for months
    1.47 -        --> does concatenation of abbr # " more " and " more " # abbr
    1.48 -  10. Handles @type( ... ) or @type{ ... }
    1.49 -  11. The keywords field is split on , or ; and put into separate xml
    1.50 -      "bibxml:keywords" tags
    1.51 -  12. Ignores @preamble
    1.52 -
    1.53 -  Known Limitations
    1.54 -  1.  Does not transform Latex encoding like math mode and special
    1.55 -      latex symbols.
    1.56 -  2.  Does not parse author fields into first and last names.
    1.57 -      E.g., It does not do anything special to an author whose name is
    1.58 -      in the form LAST_NAME, FIRST_NAME
    1.59 -      In "author" tag, will show up as
    1.60 -      <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
    1.61 -  3.  Does not handle "crossref" fields other than to print
    1.62 -      <bibxml:crossref>...</bibxml:crossref>
    1.63 -  4.  Does not inform user of the input's format errors.  You just won't
    1.64 -      be able to transform the file later with XSL
    1.65 -
    1.66 -  You will have to manually edit the XML output if you need to handle
    1.67 -  these (and unknown) limitations.
    1.68 -
    1.69 -"""
    1.70 -
    1.71 -import string, re
    1.72 -
    1.73 -# set of valid name characters
    1.74 -valid_name_chars = '[\w\-:]'
    1.75 -
    1.76 -#
    1.77 -# define global regular expression variables
    1.78 -#
    1.79 -author_rex = re.compile('\s+and\s+')
    1.80 -rembraces_rex = re.compile('[{}]')
    1.81 -capitalize_rex = re.compile('({[^}]*})')
    1.82 -
    1.83 -# used by bibtexkeywords(data)
    1.84 -keywords_rex = re.compile('[,;]')
    1.85 -
    1.86 -# used by concat_line(line)
    1.87 -concatsplit_rex = re.compile('\s*#\s*')
    1.88 -
    1.89 -# split on {, }, or " in verify_out_of_braces
    1.90 -delimiter_rex = re.compile('([{}"])',re.I)
    1.91 -
    1.92 -field_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
    1.93 -data_rex = re.compile('\s*(\w*)\s*=\s*([^,]*),?')
    1.94 -
    1.95 -url_rex = re.compile('\\\url\{([^}]*)\}')
    1.96 -
    1.97 -#
    1.98 -# styles for html formatting
    1.99 -#
   1.100 -divstyle = 'margin-top: -4ex; margin-left: 8em;'
   1.101 -
   1.102 -#
   1.103 -# return the string parameter with \url{...} commands replaced by HTML links
   1.104 -#
   1.105 -def transformurls(str):
   1.106 -    return url_rex.sub(r'<a href="\1">\1</a>', str)
   1.107 -
   1.108 -#
   1.109 -# return the string parameter without braces
   1.110 -#
   1.111 -def removebraces(str):
   1.112 -    return rembraces_rex.sub('', str)
   1.113 -
   1.114 -#
   1.115 -# latex-specific replacements
   1.116 -# (do this after braces were removed)
   1.117 -#
   1.118 -def latexreplacements(line):
   1.119 -    line = string.replace(line, '~', '&nbsp;')
   1.120 -    line = string.replace(line, '\\\'a', '&aacute;')
   1.121 -    line = string.replace(line, '\\"a', '&auml;')
   1.122 -    line = string.replace(line, '\\\'e', '&eacute;')
   1.123 -    line = string.replace(line, '\\"e', '&euml;')
   1.124 -    line = string.replace(line, '\\\'i', '&iacute;')
   1.125 -    line = string.replace(line, '\\"i', '&iuml;')
   1.126 -    line = string.replace(line, '\\\'o', '&oacute;')
   1.127 -    line = string.replace(line, '\\"o', '&ouml;')
   1.128 -    line = string.replace(line, '\\\'u', '&uacute;')
   1.129 -    line = string.replace(line, '\\"u', '&uuml;')
   1.130 -    line = string.replace(line, '\\H o', '&otilde;')
   1.131 -    line = string.replace(line, '\\H u', '&uuml;')   # &utilde; does not exist
   1.132 -    line = string.replace(line, '\\\'A', '&Aacute;')
   1.133 -    line = string.replace(line, '\\"A', '&Auml;')
   1.134 -    line = string.replace(line, '\\\'E', '&Eacute;')
   1.135 -    line = string.replace(line, '\\"E', '&Euml;')
   1.136 -    line = string.replace(line, '\\\'I', '&Iacute;')
   1.137 -    line = string.replace(line, '\\"I', '&Iuml;')
   1.138 -    line = string.replace(line, '\\\'O', '&Oacute;')
   1.139 -    line = string.replace(line, '\\"O', '&Ouml;')
   1.140 -    line = string.replace(line, '\\\'U', '&Uacute;')
   1.141 -    line = string.replace(line, '\\"U', '&Uuml;')
   1.142 -    line = string.replace(line, '\\H O', '&Otilde;')
   1.143 -    line = string.replace(line, '\\H U', '&Uuml;')   # &Utilde; does not exist
   1.144 -
   1.145 -    return line
   1.146 -
   1.147 -#
   1.148 -# copy characters from a string decoding html expressions (&xyz;)
   1.149 -#
   1.150 -def copychars(str, ifrom, count):
   1.151 -    result = ''
   1.152 -    i = ifrom
   1.153 -    c = 0
   1.154 -    html_spec = False
   1.155 -    while (i < len(str)) and (c < count):
   1.156 -        if str[i] == '&':
   1.157 -            html_spec = True;
   1.158 -            if i+1 < len(str):
   1.159 -                result += str[i+1]
   1.160 -            c += 1
   1.161 -            i += 2
   1.162 -        else:
   1.163 -            if not html_spec:
   1.164 -                if ((str[i] >= 'A') and (str[i] <= 'Z')) or \
   1.165 -                   ((str[i] >= 'a') and (str[i] <= 'z')):
   1.166 -                    result += str[i]
   1.167 -                    c += 1
   1.168 -            elif str[i] == ';':
   1.169 -                html_spec = False;
   1.170 -            i += 1
   1.171 -    
   1.172 -    return result
   1.173 -
   1.174 -
   1.175 -# 
   1.176 -# Handle a list of authors (separated by 'and').
   1.177 -# It gives back a dictionary with the following values:
   1.178 -#  - num: the number of authors,
   1.179 -#  - list: the list of the author names,
   1.180 -#  - text: the bibtex text (separated by commas and/or 'and')
   1.181 -#  - abbrev: abbreviation that can be used to indicate the
   1.182 -#    bibliography entries
   1.183 -#
   1.184 -def bibtexauthor(data):
   1.185 -    result = {}
   1.186 -    bibtex = ''
   1.187 -    result['list'] = author_rex.split(data)
   1.188 -    result['num'] = len(result['list'])
   1.189 -    for i, author in enumerate(result['list']):
   1.190 -        # general transformations
   1.191 -        author = latexreplacements(removebraces(author.strip()))
   1.192 -        # transform "Xyz, A. B." to "A. B. Xyz"
   1.193 -        pos = author.find(',')
   1.194 -        if pos != -1:
   1.195 -            author = author[pos+1:].strip() + ' ' + author[:pos].strip()
   1.196 -        result['list'][i] = author
   1.197 -        bibtex += author + '#'
   1.198 -    bibtex = bibtex[:-1]
   1.199 -    if result['num'] > 1:
   1.200 -        ix = bibtex.rfind('#')
   1.201 -        if result['num'] == 2:
   1.202 -            bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
   1.203 -        else:
   1.204 -            bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
   1.205 -    bibtex = bibtex.replace('#', ', ')
   1.206 -    result['text'] = bibtex
   1.207 -    
   1.208 -    result['abbrev'] = ''
   1.209 -    for author in result['list']:
   1.210 -        pos = author.rfind(' ') + 1
   1.211 -        count = 1
   1.212 -        if result['num'] == 1:
   1.213 -            count = 3
   1.214 -        result['abbrev'] += copychars(author, pos, count)
   1.215 -
   1.216 -    return result
   1.217 -
   1.218 -
   1.219 -#
   1.220 -# data = title string
   1.221 -# @return the capitalized title (first letter is capitalized), rest are capitalized
   1.222 -# only if capitalized inside braces
   1.223 -#
   1.224 -def capitalizetitle(data):
   1.225 -    title_list = capitalize_rex.split(data)
   1.226 -    title = ''
   1.227 -    count = 0
   1.228 -    for phrase in title_list:
   1.229 -         check = string.lstrip(phrase)
   1.230 -
   1.231 -         # keep phrase's capitalization the same
   1.232 -         if check.find('{') == 0:
   1.233 -              title += removebraces(phrase)
   1.234 -         else:
   1.235 -         # first word --> capitalize first letter (after spaces)
   1.236 -              if count == 0:
   1.237 -                  title += check.capitalize()
   1.238 -              else:
   1.239 -                  title += phrase.lower()
   1.240 -         count = count + 1
   1.241 -
   1.242 -    return title
   1.243 -
   1.244 -
   1.245 -#
   1.246 -# @return the bibtex for the title
   1.247 -# @param data --> title string
   1.248 -# braces are removed from title
   1.249 -#
   1.250 -def bibtextitle(data, entrytype):
   1.251 -    if entrytype in ('book', 'inbook'):
   1.252 -        title = removebraces(data.strip())
   1.253 -    else:
   1.254 -        title = removebraces(capitalizetitle(data.strip()))
   1.255 -    bibtex = title
   1.256 -    return bibtex
   1.257 -
   1.258 -
   1.259 -#
   1.260 -# function to compare entry lists
   1.261 -#
   1.262 -def entry_cmp(x, y):
   1.263 -    return cmp(x[0], y[0])
   1.264 -
   1.265 -
   1.266 -#
   1.267 -# return the Doxygen output lines generated from the washed "filecont_source"
   1.268 -#
   1.269 -def bibtexdecoder(filecont_source):
   1.270 -    filecont = []
   1.271 -    file = []
   1.272 -    
   1.273 -    # want @<alphanumeric chars><spaces>{<spaces><any chars>,
   1.274 -    pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')
   1.275 -    endtype_rex = re.compile('}\s*$')
   1.276 -    endtag_rex = re.compile('^\s*}\s*$')
   1.277 -
   1.278 -    bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
   1.279 -    bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')
   1.280 -
   1.281 -    quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
   1.282 -    quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')
   1.283 -
   1.284 -    for line in filecont_source:
   1.285 -        line = line[:-1]
   1.286 -
   1.287 -        # encode character entities
   1.288 -        line = string.replace(line, '&', '&amp;')
   1.289 -        line = string.replace(line, '<', '&lt;')
   1.290 -        line = string.replace(line, '>', '&gt;')
   1.291 -
   1.292 -        # start entry: publication type (store for later use)
   1.293 -        if pubtype_rex.match(line):
   1.294 -        # want @<alphanumeric chars><spaces>{<spaces><any chars>,
   1.295 -            entrycont = {}
   1.296 -            entry = []
   1.297 -            entrytype = pubtype_rex.sub('\g<1>',line)
   1.298 -            entrytype = string.lower(entrytype)
   1.299 -            entryid   = pubtype_rex.sub('\g<2>', line)
   1.300 -
   1.301 -        # end entry if just a }
   1.302 -        elif endtype_rex.match(line):
   1.303 -            # generate doxygen code for the entry
   1.304 -
   1.305 -            # enty type related formattings
   1.306 -            if entrytype in ('book', 'inbook'):
   1.307 -                entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
   1.308 -                if not entrycont.has_key('author'):
   1.309 -                    entrycont['author'] = entrycont['editor']
   1.310 -                    entrycont['author']['text'] += ', editors'
   1.311 -            elif entrytype == 'article':
   1.312 -                entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
   1.313 -            elif entrytype in ('inproceedings', 'incollection', 'conference'):
   1.314 -                entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
   1.315 -            elif entrytype == 'techreport':
   1.316 -                if not entrycont.has_key('type'):
   1.317 -                    entrycont['type'] = 'Technical report'
   1.318 -            elif entrytype == 'mastersthesis':
   1.319 -                entrycont['type'] = 'Master\'s thesis'
   1.320 -            elif entrytype == 'phdthesis':
   1.321 -                entrycont['type'] = 'PhD thesis'
   1.322 -
   1.323 -            for eline in entrycont:
   1.324 -                if eline != '':
   1.325 -                    eline = latexreplacements(eline)
   1.326 -
   1.327 -            if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   1.328 -                entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')
   1.329 -
   1.330 -            if entrycont.has_key('author') and (entrycont['author'] != ''):
   1.331 -                entry.append(entrycont['author']['text'] + '.')
   1.332 -            if entrycont.has_key('title') and (entrycont['title'] != ''):
   1.333 -                entry.append(entrycont['title'] + '.')
   1.334 -            if entrycont.has_key('journal') and (entrycont['journal'] != ''):
   1.335 -                entry.append(entrycont['journal'] + ',')
   1.336 -            if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
   1.337 -                entry.append('In ' + entrycont['booktitle'] + ',')
   1.338 -            if entrycont.has_key('type') and (entrycont['type'] != ''):
   1.339 -                eline = entrycont['type']
   1.340 -                if entrycont.has_key('number') and (entrycont['number'] != ''):
   1.341 -                    eline += ' ' + entrycont['number']
   1.342 -                eline += ','
   1.343 -                entry.append(eline)
   1.344 -            if entrycont.has_key('institution') and (entrycont['institution'] != ''):
   1.345 -                entry.append(entrycont['institution'] + ',')
   1.346 -            if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
   1.347 -                entry.append(entrycont['publisher'] + ',')
   1.348 -            if entrycont.has_key('school') and (entrycont['school'] != ''):
   1.349 -                entry.append(entrycont['school'] + ',')
   1.350 -            if entrycont.has_key('address') and (entrycont['address'] != ''):
   1.351 -                entry.append(entrycont['address'] + ',')
   1.352 -            if entrycont.has_key('edition') and (entrycont['edition'] != ''):
   1.353 -                entry.append(entrycont['edition'] + ' edition,')
   1.354 -            if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
   1.355 -                entry.append(entrycont['howpublished'] + ',')
   1.356 -            if entrycont.has_key('volume') and (entrycont['volume'] != ''):
   1.357 -                eline = entrycont['volume'];
   1.358 -                if entrycont.has_key('number') and (entrycont['number'] != ''):
   1.359 -                    eline += '(' + entrycont['number'] + ')'
   1.360 -                if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   1.361 -                    eline += ':' + entrycont['pages']
   1.362 -                eline += ','
   1.363 -                entry.append(eline)
   1.364 -            else:
   1.365 -                if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   1.366 -                    entry.append('pages ' + entrycont['pages'] + ',')
   1.367 -            if entrycont.has_key('year') and (entrycont['year'] != ''):
   1.368 -                if entrycont.has_key('month') and (entrycont['month'] != ''):
   1.369 -                    entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
   1.370 -                else:
   1.371 -                    entry.append(entrycont['year'] + '.')
   1.372 -            if entrycont.has_key('note') and (entrycont['note'] != ''):
   1.373 -                entry.append(entrycont['note'] + '.')
   1.374 -            if entrycont.has_key('url') and (entrycont['url'] != ''):
   1.375 -                entry.append(entrycont['url'] + '.')
   1.376 -
   1.377 -            # generate keys for sorting and for the output
   1.378 -            sortkey = ''
   1.379 -            bibkey = ''
   1.380 -            if entrycont.has_key('author'):
   1.381 -                for author in entrycont['author']['list']:
   1.382 -                    sortkey += copychars(author, author.rfind(' ')+1, len(author))
   1.383 -                bibkey = entrycont['author']['abbrev']
   1.384 -            else:
   1.385 -                bibkey = 'x'
   1.386 -            if entrycont.has_key('year'):
   1.387 -                sortkey += entrycont['year']
   1.388 -                bibkey += entrycont['year'][-2:]
   1.389 -            if entrycont.has_key('title'):
   1.390 -                sortkey += entrycont['title']
   1.391 -            if entrycont.has_key('key'):
   1.392 -                sortkey = entrycont['key'] + sortkey
   1.393 -                bibkey = entrycont['key']
   1.394 -            entry.insert(0, sortkey)
   1.395 -            entry.insert(1, bibkey)
   1.396 -            entry.insert(2, entryid)
   1.397 -           
   1.398 -            # add the entry to the file contents
   1.399 -            filecont.append(entry)
   1.400 -
   1.401 -        else:
   1.402 -            # field, publication info
   1.403 -            field = ''
   1.404 -            data = ''
   1.405 -            
   1.406 -            # field = {data} entries
   1.407 -            if bracedata_rex.match(line):
   1.408 -                field = bracefield_rex.sub('\g<1>', line)
   1.409 -                field = string.lower(field)
   1.410 -                data =  bracedata_rex.sub('\g<2>', line)
   1.411 -
   1.412 -            # field = "data" entries
   1.413 -            elif quotedata_rex.match(line):
   1.414 -                field = quotefield_rex.sub('\g<1>', line)
   1.415 -                field = string.lower(field)
   1.416 -                data =  quotedata_rex.sub('\g<2>', line)
   1.417 -
   1.418 -            # field = data entries
   1.419 -            elif data_rex.match(line):
   1.420 -                field = field_rex.sub('\g<1>', line)
   1.421 -                field = string.lower(field)
   1.422 -                data =  data_rex.sub('\g<2>', line)
   1.423 -
   1.424 -            if field == 'url':
   1.425 -                data = '\\url{' + data.strip() + '}'
   1.426 -            
   1.427 -            if field in ('author', 'editor'):
   1.428 -                entrycont[field] = bibtexauthor(data)
   1.429 -                line = ''
   1.430 -            elif field == 'title':
   1.431 -                line = bibtextitle(data, entrytype)
   1.432 -            elif field != '':
   1.433 -                line = removebraces(transformurls(data.strip()))
   1.434 -
   1.435 -            if line != '':
   1.436 -                line = latexreplacements(line)
   1.437 -                entrycont[field] = line
   1.438 -
   1.439 -
   1.440 -    # sort entries
   1.441 -    filecont.sort(entry_cmp)
   1.442 -    
   1.443 -    # count the bibtex keys
   1.444 -    keytable = {}
   1.445 -    counttable = {}
   1.446 -    for entry in filecont:
   1.447 -        bibkey = entry[1]
   1.448 -        if not keytable.has_key(bibkey):
   1.449 -            keytable[bibkey] = 1
   1.450 -        else:
   1.451 -            keytable[bibkey] += 1
   1.452 -
   1.453 -    for bibkey in keytable.keys():
   1.454 -        counttable[bibkey] = 0
   1.455 -    
   1.456 -    # generate output
   1.457 -    for entry in filecont:
   1.458 -        # generate output key form the bibtex key
   1.459 -        bibkey = entry[1]
   1.460 -        entryid = entry[2]
   1.461 -        if keytable[bibkey] == 1:
   1.462 -            outkey = bibkey
   1.463 -        else:
   1.464 -            outkey = bibkey + chr(97 + counttable[bibkey])
   1.465 -        counttable[bibkey] += 1
   1.466 -        
   1.467 -        # append the entry code to the output
   1.468 -        file.append('\\section ' + entryid + ' [' + outkey + ']')
   1.469 -        file.append('<div style="' + divstyle + '">')
   1.470 -        for line in entry[3:]:
   1.471 -            file.append(line)
   1.472 -        file.append('</div>')
   1.473 -        file.append('')
   1.474 -
   1.475 -    return file
   1.476 -
   1.477 -
   1.478 -#
   1.479 -# return 1 iff abbr is in line but not inside braces or quotes
   1.480 -# assumes that abbr appears only once on the line (out of braces and quotes)
   1.481 -#
   1.482 -def verify_out_of_braces(line, abbr):
   1.483 -
   1.484 -    phrase_split = delimiter_rex.split(line)
   1.485 -
   1.486 -    abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)
   1.487 -
   1.488 -    open_brace = 0
   1.489 -    open_quote = 0
   1.490 -
   1.491 -    for phrase in phrase_split:
   1.492 -        if phrase == "{":
   1.493 -            open_brace = open_brace + 1
   1.494 -        elif phrase == "}":
   1.495 -            open_brace = open_brace - 1
   1.496 -        elif phrase == '"':
   1.497 -            if open_quote == 1:
   1.498 -                open_quote = 0
   1.499 -            else:
   1.500 -                open_quote = 1
   1.501 -        elif abbr_rex.search(phrase):
   1.502 -            if open_brace == 0 and open_quote == 0:
   1.503 -                return 1
   1.504 -
   1.505 -    return 0
   1.506 -
   1.507 -
   1.508 -#
   1.509 -# a line in the form phrase1 # phrase2 # ... # phrasen
   1.510 -# is returned as phrase1 phrase2 ... phrasen
   1.511 -# with the correct punctuation
   1.512 -# Bug: Doesn't always work with multiple abbreviations plugged in
   1.513 -#
   1.514 -def concat_line(line):
   1.515 -    # only look at part after equals
   1.516 -    field = field_rex.sub('\g<1>',line)
   1.517 -    rest = field_rex.sub('\g<2>',line)
   1.518 -
   1.519 -    concat_line = field + ' ='
   1.520 -
   1.521 -    pound_split = concatsplit_rex.split(rest)
   1.522 -
   1.523 -    phrase_count = 0
   1.524 -    length = len(pound_split)
   1.525 -
   1.526 -    for phrase in pound_split:
   1.527 -        phrase = phrase.strip()
   1.528 -        if phrase_count != 0:
   1.529 -            if phrase.startswith('"') or phrase.startswith('{'):
   1.530 -                phrase = phrase[1:]
   1.531 -        elif phrase.startswith('"'):
   1.532 -            phrase = phrase.replace('"','{',1)
   1.533 -
   1.534 -        if phrase_count != length-1:
   1.535 -            if phrase.endswith('"') or phrase.endswith('}'):
   1.536 -                phrase = phrase[:-1]
   1.537 -        else:
   1.538 -            if phrase.endswith('"'):
   1.539 -                phrase = phrase[:-1]
   1.540 -                phrase = phrase + "}"
   1.541 -            elif phrase.endswith('",'):
   1.542 -                phrase = phrase[:-2]
   1.543 -                phrase = phrase + "},"
   1.544 -
   1.545 -        # if phrase did have \#, add the \# back
   1.546 -        if phrase.endswith('\\'):
   1.547 -            phrase = phrase + "#"
   1.548 -        concat_line = concat_line + ' ' + phrase
   1.549 -
   1.550 -        phrase_count = phrase_count + 1
   1.551 -
   1.552 -    return concat_line
   1.553 -
   1.554 -
   1.555 -#
   1.556 -# substitute abbreviations into filecont
   1.557 -# @param filecont_source - string of data from file
   1.558 -#
   1.559 -def bibtex_replace_abbreviations(filecont_source):
   1.560 -    filecont = filecont_source.splitlines()
   1.561 -
   1.562 -    #  These are defined in bibtex, so we'll define them too
   1.563 -    abbr_list = ['jan','feb','mar','apr','may','jun',
   1.564 -                 'jul','aug','sep','oct','nov','dec']
   1.565 -    value_list = ['January','February','March','April',
   1.566 -                  'May','June','July','August','September',
   1.567 -                  'October','November','December']
   1.568 -
   1.569 -    abbr_rex = []
   1.570 -    total_abbr_count = 0
   1.571 -
   1.572 -    front = '\\b'
   1.573 -    back = '(,?)\\b'
   1.574 -
   1.575 -    for x in abbr_list:
   1.576 -        abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
   1.577 -        total_abbr_count = total_abbr_count + 1
   1.578 -
   1.579 -
   1.580 -    abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
   1.581 -                             re.I)
   1.582 -
   1.583 -    comment_rex = re.compile('@comment\s*{',re.I)
   1.584 -    preamble_rex = re.compile('@preamble\s*{',re.I)
   1.585 -
   1.586 -    waiting_for_end_string = 0
   1.587 -    i = 0
   1.588 -    filecont2 = ''
   1.589 -
   1.590 -    for line in filecont:
   1.591 -        if line == ' ' or line == '':
   1.592 -            continue
   1.593 -
   1.594 -        if waiting_for_end_string:
   1.595 -            if re.search('}',line):
   1.596 -                waiting_for_end_string = 0
   1.597 -                continue
   1.598 -
   1.599 -        if abbrdef_rex.search(line):
   1.600 -            abbr = abbrdef_rex.sub('\g<1>', line)
   1.601 -
   1.602 -            if abbr_list.count(abbr) == 0:
   1.603 -                val = abbrdef_rex.sub('\g<2>', line)
   1.604 -                abbr_list.append(abbr)
   1.605 -                value_list.append(string.strip(val))
   1.606 -                abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
   1.607 -                total_abbr_count = total_abbr_count + 1
   1.608 -            waiting_for_end_string = 1
   1.609 -            continue
   1.610 -
   1.611 -        if comment_rex.search(line):
   1.612 -            waiting_for_end_string = 1
   1.613 -            continue
   1.614 -
   1.615 -        if preamble_rex.search(line):
   1.616 -            waiting_for_end_string = 1
   1.617 -            continue
   1.618 -
   1.619 -
   1.620 -        # replace subsequent abbreviations with the value
   1.621 -        abbr_count = 0
   1.622 -
   1.623 -        for x in abbr_list:
   1.624 -
   1.625 -            if abbr_rex[abbr_count].search(line):
   1.626 -                if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:
   1.627 -                    line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)
   1.628 -                # Check for # concatenations
   1.629 -                if concatsplit_rex.search(line):
   1.630 -                    line = concat_line(line)
   1.631 -            abbr_count = abbr_count + 1
   1.632 -
   1.633 -
   1.634 -        filecont2 = filecont2 + line + '\n'
   1.635 -        i = i+1
   1.636 -
   1.637 -
   1.638 -    # Do one final pass over file
   1.639 -
   1.640 -    # make sure that didn't end up with {" or }" after the substitution
   1.641 -    filecont2 = filecont2.replace('{"','{{')
   1.642 -    filecont2 = filecont2.replace('"}','}}')
   1.643 -
   1.644 -    afterquotevalue_rex = re.compile('"\s*,\s*')
   1.645 -    afterbrace_rex = re.compile('"\s*}')
   1.646 -    afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')
   1.647 -
   1.648 -    # add new lines to data that changed because of abbreviation substitutions
   1.649 -    filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
   1.650 -    filecont2 = afterbrace_rex.sub('"\n}', filecont2)
   1.651 -    filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)
   1.652 -
   1.653 -    return filecont2
   1.654 -
   1.655 -#
   1.656 -# convert @type( ... ) to @type{ ... }
   1.657 -#
   1.658 -def no_outer_parens(filecont):
   1.659 -
   1.660 -    # do checking for open parens
   1.661 -    # will convert to braces
   1.662 -    paren_split = re.split('([(){}])',filecont)
   1.663 -
   1.664 -    open_paren_count = 0
   1.665 -    open_type = 0
   1.666 -    look_next = 0
   1.667 -
   1.668 -    # rebuild filecont
   1.669 -    filecont = ''
   1.670 -
   1.671 -    at_rex = re.compile('@\w*')
   1.672 -
   1.673 -    for phrase in paren_split:
   1.674 -        if look_next == 1:
   1.675 -            if phrase == '(':
   1.676 -                phrase = '{'
   1.677 -                open_paren_count = open_paren_count + 1
   1.678 -            else:
   1.679 -                open_type = 0
   1.680 -            look_next = 0
   1.681 -
   1.682 -        if phrase == '(':
   1.683 -            open_paren_count = open_paren_count + 1
   1.684 -
   1.685 -        elif phrase == ')':
   1.686 -            open_paren_count = open_paren_count - 1
   1.687 -            if open_type == 1 and open_paren_count == 0:
   1.688 -                phrase = '}'
   1.689 -                open_type = 0
   1.690 -
   1.691 -        elif at_rex.search( phrase ):
   1.692 -            open_type = 1
   1.693 -            look_next = 1
   1.694 -
   1.695 -        filecont = filecont + phrase
   1.696 -
   1.697 -    return filecont
   1.698 -
   1.699 -
   1.700 -#
   1.701 -# make all whitespace into just one space
   1.702 -# format the bibtex file into a usable form.
   1.703 -#
   1.704 -def bibtexwasher(filecont_source):
   1.705 -
   1.706 -    space_rex = re.compile('\s+')
   1.707 -    comment_rex = re.compile('\s*%')
   1.708 -
   1.709 -    filecont = []
   1.710 -
   1.711 -    # remove trailing and excessive whitespace
   1.712 -    # ignore comments
   1.713 -    for line in filecont_source:
   1.714 -        line = string.strip(line)
   1.715 -        line = space_rex.sub(' ', line)
   1.716 -        # ignore comments
   1.717 -        if not comment_rex.match(line) and line != '':
   1.718 -            filecont.append(' '+ line)
   1.719 -
   1.720 -    filecont = string.join(filecont, '')
   1.721 -
   1.722 -    # the file is in one long string
   1.723 -
   1.724 -    filecont = no_outer_parens(filecont)
   1.725 -
   1.726 -    #
   1.727 -    # split lines according to preferred syntax scheme
   1.728 -    #
   1.729 -    filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)
   1.730 -
   1.731 -    # add new lines after commas that are after values
   1.732 -    filecont = re.sub('"\s*,', '",\n', filecont)
   1.733 -    filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
   1.734 -    filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',
   1.735 -                          '\n\n\g<1>\g<2>,\n', filecont)
   1.736 -
   1.737 -    # add new lines after }
   1.738 -    filecont = re.sub('"\s*}','"\n}\n', filecont)
   1.739 -    filecont = re.sub('}\s*,','},\n', filecont)
   1.740 -
   1.741 -
   1.742 -    filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)
   1.743 -
   1.744 -    # character encoding, reserved latex characters
   1.745 -    filecont = re.sub('{\\\&}', '&', filecont)
   1.746 -    filecont = re.sub('\\\&', '&', filecont)
   1.747 -
   1.748 -    # do checking for open braces to get format correct
   1.749 -    open_brace_count = 0
   1.750 -    brace_split = re.split('([{}])',filecont)
   1.751 -
   1.752 -    # rebuild filecont
   1.753 -    filecont = ''
   1.754 -
   1.755 -    for phrase in brace_split:
   1.756 -        if phrase == '{':
   1.757 -            open_brace_count = open_brace_count + 1
   1.758 -        elif phrase == '}':
   1.759 -            open_brace_count = open_brace_count - 1
   1.760 -            if open_brace_count == 0:
   1.761 -                filecont = filecont + '\n'
   1.762 -
   1.763 -        filecont = filecont + phrase
   1.764 -
   1.765 -    filecont2 = bibtex_replace_abbreviations(filecont)
   1.766 -
   1.767 -    # gather
   1.768 -    filecont = filecont2.splitlines()
   1.769 -    i=0
   1.770 -    j=0         # count the number of blank lines
   1.771 -    for line in filecont:
   1.772 -        # ignore blank lines
   1.773 -        if line == '' or line == ' ':
   1.774 -            j = j+1
   1.775 -            continue
   1.776 -        filecont[i] = line + '\n'
   1.777 -        i = i+1
   1.778 -
   1.779 -    # get rid of the extra stuff at the end of the array
   1.780 -    # (The extra stuff are duplicates that are in the array because
   1.781 -    # blank lines were removed.)
   1.782 -    length = len( filecont)
   1.783 -    filecont[length-j:length] = []
   1.784 -
   1.785 -    return filecont
   1.786 -
   1.787 -
   1.788 -def filehandler(filepath):
   1.789 -    try:
   1.790 -        fd = open(filepath, 'r')
   1.791 -        filecont_source = fd.readlines()
   1.792 -        fd.close()
   1.793 -    except:
   1.794 -        print 'Could not open file:', filepath
   1.795 -    washeddata = bibtexwasher(filecont_source)
   1.796 -    outdata = bibtexdecoder(washeddata)
   1.797 -    print '/**'
   1.798 -    print '\page references References'
   1.799 -    print
   1.800 -    for line in outdata:
   1.801 -        print line
   1.802 -    print '*/'
   1.803 -
   1.804 -
   1.805 -# main program
   1.806 -
   1.807 -def main():
   1.808 -    import sys
   1.809 -    if sys.argv[1:]:
   1.810 -        filepath = sys.argv[1]
   1.811 -    else:
   1.812 -        print "No input file"
   1.813 -        sys.exit()
   1.814 -    filehandler(filepath)
   1.815 -
   1.816 -if __name__ == "__main__": main()
   1.817 -
   1.818 -
   1.819 -# end python script