1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/scripts/bib2dox.py	Sun Aug 11 15:28:12 2013 +0200
     1.3 @@ -0,0 +1,816 @@
     1.4 +#! /usr/bin/env python
     1.5 +"""
     1.6 +  BibTeX to Doxygen converter
     1.7 +  Usage: python bib2dox.py bibfile.bib > bibfile.dox
     1.8 +
     1.9 +  This file is a part of LEMON, a generic C++ optimization library.
    1.10 +
    1.11 +  **********************************************************************
    1.12 +
    1.13 +  This code is a modification of the BibTeX to XML converter
    1.14 +  by Vidar Bronken Gundersen et al.
    1.15 +  See the original copyright notices below. 
    1.16 +
    1.17 +  **********************************************************************
    1.18 +
    1.19 +  Decoder for bibliographic data, BibTeX
    1.20 +  Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
    1.21 +
    1.22 +  v.8
    1.23 +  (c)2002-06-23 Vidar Bronken Gundersen
    1.24 +  http://bibtexml.sf.net/
    1.25 +  Reuse approved as long as this notification is kept.
    1.26 +  Licence: GPL.
    1.27 +
    1.28 +  Contributions/thanks to:
    1.29 +  Egon Willighagen, http://sf.net/projects/jreferences/
    1.30 +  Richard Mahoney (for providing a test case)
    1.31 +
    1.32 +  Edited by Sara Sprenkle to be more robust and handle more bibtex features.
    1.33 +  (c) 2003-01-15
    1.34 +
    1.35 +  1.  Changed bibtex: tags to bibxml: tags.
    1.36 +  2.  Use xmlns:bibxml="http://bibtexml.sf.net/"
    1.37 +  3.  Allow spaces between @type and first {
    1.38 +  4.  "author" fields with multiple authors split by " and "
    1.39 +      are put in separate xml "bibxml:author" tags.
    1.40 +  5.  Option for Titles: words are capitalized
    1.41 +      only if first letter in title or capitalized inside braces
    1.42 +  6.  Removes braces from within field values
    1.43 +  7.  Ignores comments in bibtex file (including @comment{ or % )
    1.44 +  8.  Replaces some special latex tags, e.g., replaces ~ with '&#160;'
    1.45 +  9.  Handles bibtex @string abbreviations
    1.46 +        --> includes bibtex's default abbreviations for months
    1.47 +        --> does concatenation of abbr # " more " and " more " # abbr
    1.48 +  10. Handles @type( ... ) or @type{ ... }
    1.49 +  11. The keywords field is split on , or ; and put into separate xml
    1.50 +      "bibxml:keywords" tags
    1.51 +  12. Ignores @preamble
    1.52 +
    1.53 +  Known Limitations
    1.54 +  1.  Does not transform Latex encoding like math mode and special
    1.55 +      latex symbols.
    1.56 +  2.  Does not parse author fields into first and last names.
    1.57 +      E.g., It does not do anything special to an author whose name is
    1.58 +      in the form LAST_NAME, FIRST_NAME
    1.59 +      In "author" tag, will show up as
    1.60 +      <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
    1.61 +  3.  Does not handle "crossref" fields other than to print
    1.62 +      <bibxml:crossref>...</bibxml:crossref>
    1.63 +  4.  Does not inform user of the input's format errors.  You just won't
    1.64 +      be able to transform the file later with XSL
    1.65 +
    1.66 +  You will have to manually edit the XML output if you need to handle
    1.67 +  these (and unknown) limitations.
    1.68 +
    1.69 +"""
    1.70 +
    1.71 +import string, re
    1.72 +
    1.73 +# set of valid name characters
    1.74 +valid_name_chars = '[\w\-:]'
    1.75 +
    1.76 +#
    1.77 +# define global regular expression variables
    1.78 +#
    1.79 +author_rex = re.compile('\s+and\s+')
    1.80 +rembraces_rex = re.compile('[{}]')
    1.81 +capitalize_rex = re.compile('({[^}]*})')
    1.82 +
    1.83 +# used by bibtexkeywords(data)
    1.84 +keywords_rex = re.compile('[,;]')
    1.85 +
    1.86 +# used by concat_line(line)
    1.87 +concatsplit_rex = re.compile('\s*#\s*')
    1.88 +
    1.89 +# split on {, }, or " in verify_out_of_braces
    1.90 +delimiter_rex = re.compile('([{}"])',re.I)
    1.91 +
    1.92 +field_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
    1.93 +data_rex = re.compile('\s*(\w*)\s*=\s*([^,]*),?')
    1.94 +
    1.95 +url_rex = re.compile('\\\url\{([^}]*)\}')
    1.96 +
    1.97 +#
    1.98 +# styles for html formatting
    1.99 +#
   1.100 +divstyle = 'margin-top: -4ex; margin-left: 8em;'
   1.101 +
   1.102 +#
   1.103 +# return the string parameter without braces
   1.104 +#
   1.105 +def transformurls(str):
   1.106 +    return url_rex.sub(r'<a href="\1">\1</a>', str)
   1.107 +
   1.108 +#
   1.109 +# return the string parameter without braces
   1.110 +#
   1.111 +def removebraces(str):
   1.112 +    return rembraces_rex.sub('', str)
   1.113 +
   1.114 +#
   1.115 +# latex-specific replacements
   1.116 +# (do this after braces were removed)
   1.117 +#
   1.118 +def latexreplacements(line):
   1.119 +    line = string.replace(line, '~', '&nbsp;')
   1.120 +    line = string.replace(line, '\\\'a', '&aacute;')
   1.121 +    line = string.replace(line, '\\"a', '&auml;')
   1.122 +    line = string.replace(line, '\\\'e', '&eacute;')
   1.123 +    line = string.replace(line, '\\"e', '&euml;')
   1.124 +    line = string.replace(line, '\\\'i', '&iacute;')
   1.125 +    line = string.replace(line, '\\"i', '&iuml;')
   1.126 +    line = string.replace(line, '\\\'o', '&oacute;')
   1.127 +    line = string.replace(line, '\\"o', '&ouml;')
   1.128 +    line = string.replace(line, '\\\'u', '&uacute;')
   1.129 +    line = string.replace(line, '\\"u', '&uuml;')
   1.130 +    line = string.replace(line, '\\H o', '&otilde;')
   1.131 +    line = string.replace(line, '\\H u', '&uuml;')   # &utilde; does not exist
   1.132 +    line = string.replace(line, '\\\'A', '&Aacute;')
   1.133 +    line = string.replace(line, '\\"A', '&Auml;')
   1.134 +    line = string.replace(line, '\\\'E', '&Eacute;')
   1.135 +    line = string.replace(line, '\\"E', '&Euml;')
   1.136 +    line = string.replace(line, '\\\'I', '&Iacute;')
   1.137 +    line = string.replace(line, '\\"I', '&Iuml;')
   1.138 +    line = string.replace(line, '\\\'O', '&Oacute;')
   1.139 +    line = string.replace(line, '\\"O', '&Ouml;')
   1.140 +    line = string.replace(line, '\\\'U', '&Uacute;')
   1.141 +    line = string.replace(line, '\\"U', '&Uuml;')
   1.142 +    line = string.replace(line, '\\H O', '&Otilde;')
   1.143 +    line = string.replace(line, '\\H U', '&Uuml;')   # &Utilde; does not exist
   1.144 +
   1.145 +    return line
   1.146 +
   1.147 +#
   1.148 +# copy characters form a string decoding html expressions (&xyz;)
   1.149 +#
   1.150 +def copychars(str, ifrom, count):
   1.151 +    result = ''
   1.152 +    i = ifrom
   1.153 +    c = 0
   1.154 +    html_spec = False
   1.155 +    while (i < len(str)) and (c < count):
   1.156 +        if str[i] == '&':
   1.157 +            html_spec = True;
   1.158 +            if i+1 < len(str):
   1.159 +                result += str[i+1]
   1.160 +            c += 1
   1.161 +            i += 2
   1.162 +        else:
   1.163 +            if not html_spec:
   1.164 +                if ((str[i] >= 'A') and (str[i] <= 'Z')) or \
   1.165 +                   ((str[i] >= 'a') and (str[i] <= 'z')):
   1.166 +                    result += str[i]
   1.167 +                    c += 1
   1.168 +            elif str[i] == ';':
   1.169 +                html_spec = False;
   1.170 +            i += 1
   1.171 +    
   1.172 +    return result
   1.173 +
   1.174 +
   1.175 +# 
   1.176 +# Handle a list of authors (separated by 'and').
   1.177 +# It gives back an array of the follwing values:
   1.178 +#  - num: the number of authors,
   1.179 +#  - list: the list of the author names,
   1.180 +#  - text: the bibtex text (separated by commas and/or 'and')
   1.181 +#  - abbrev: abbreviation that can be used for indicate the
   1.182 +#    bibliography entries
   1.183 +#
   1.184 +def bibtexauthor(data):
   1.185 +    result = {}
   1.186 +    bibtex = ''
   1.187 +    result['list'] = author_rex.split(data)
   1.188 +    result['num'] = len(result['list'])
   1.189 +    for i, author in enumerate(result['list']):
   1.190 +        # general transformations
   1.191 +        author = latexreplacements(removebraces(author.strip()))
   1.192 +        # transform "Xyz, A. B." to "A. B. Xyz"
   1.193 +        pos = author.find(',')
   1.194 +        if pos != -1:
   1.195 +            author = author[pos+1:].strip() + ' ' + author[:pos].strip()
   1.196 +        result['list'][i] = author
   1.197 +        bibtex += author + '#'
   1.198 +    bibtex = bibtex[:-1]
   1.199 +    if result['num'] > 1:
   1.200 +        ix = bibtex.rfind('#')
   1.201 +        if result['num'] == 2:
   1.202 +            bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
   1.203 +        else:
   1.204 +            bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
   1.205 +    bibtex = bibtex.replace('#', ', ')
   1.206 +    result['text'] = bibtex
   1.207 +    
   1.208 +    result['abbrev'] = ''
   1.209 +    for author in result['list']:
   1.210 +        pos = author.rfind(' ') + 1
   1.211 +        count = 1
   1.212 +        if result['num'] == 1:
   1.213 +            count = 3
   1.214 +        result['abbrev'] += copychars(author, pos, count)
   1.215 +
   1.216 +    return result
   1.217 +
   1.218 +
   1.219 +#
   1.220 +# data = title string
   1.221 +# @return the capitalized title (first letter is capitalized), rest are capitalized
   1.222 +# only if capitalized inside braces
   1.223 +#
   1.224 +def capitalizetitle(data):
   1.225 +    title_list = capitalize_rex.split(data)
   1.226 +    title = ''
   1.227 +    count = 0
   1.228 +    for phrase in title_list:
   1.229 +         check = string.lstrip(phrase)
   1.230 +
   1.231 +         # keep phrase's capitalization the same
   1.232 +         if check.find('{') == 0:
   1.233 +              title += removebraces(phrase)
   1.234 +         else:
   1.235 +         # first word --> capitalize first letter (after spaces)
   1.236 +              if count == 0:
   1.237 +                  title += check.capitalize()
   1.238 +              else:
   1.239 +                  title += phrase.lower()
   1.240 +         count = count + 1
   1.241 +
   1.242 +    return title
   1.243 +
   1.244 +
   1.245 +#
   1.246 +# @return the bibtex for the title
   1.247 +# @param data --> title string
   1.248 +# braces are removed from title
   1.249 +#
   1.250 +def bibtextitle(data, entrytype):
   1.251 +    if entrytype in ('book', 'inbook'):
   1.252 +        title = removebraces(data.strip())
   1.253 +    else:
   1.254 +        title = removebraces(capitalizetitle(data.strip()))
   1.255 +    bibtex = title
   1.256 +    return bibtex
   1.257 +
   1.258 +
   1.259 +#
   1.260 +# function to compare entry lists
   1.261 +#
   1.262 +def entry_cmp(x, y):
   1.263 +    return cmp(x[0], y[0])
   1.264 +
   1.265 +
   1.266 +#
   1.267 +# print the XML for the transformed "filecont_source"
   1.268 +#
   1.269 +def bibtexdecoder(filecont_source):
   1.270 +    filecont = []
   1.271 +    file = []
   1.272 +    
   1.273 +    # want @<alphanumeric chars><spaces>{<spaces><any chars>,
   1.274 +    pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')
   1.275 +    endtype_rex = re.compile('}\s*$')
   1.276 +    endtag_rex = re.compile('^\s*}\s*$')
   1.277 +
   1.278 +    bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
   1.279 +    bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')
   1.280 +
   1.281 +    quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
   1.282 +    quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')
   1.283 +
   1.284 +    for line in filecont_source:
   1.285 +        line = line[:-1]
   1.286 +
   1.287 +        # encode character entities
   1.288 +        line = string.replace(line, '&', '&amp;')
   1.289 +        line = string.replace(line, '<', '&lt;')
   1.290 +        line = string.replace(line, '>', '&gt;')
   1.291 +
   1.292 +        # start entry: publication type (store for later use)
   1.293 +        if pubtype_rex.match(line):
   1.294 +        # want @<alphanumeric chars><spaces>{<spaces><any chars>,
   1.295 +            entrycont = {}
   1.296 +            entry = []
   1.297 +            entrytype = pubtype_rex.sub('\g<1>',line)
   1.298 +            entrytype = string.lower(entrytype)
   1.299 +            entryid   = pubtype_rex.sub('\g<2>', line)
   1.300 +
   1.301 +        # end entry if just a }
   1.302 +        elif endtype_rex.match(line):
   1.303 +            # generate doxygen code for the entry
   1.304 +
   1.305 +            # enty type related formattings
   1.306 +            if entrytype in ('book', 'inbook'):
   1.307 +                entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
   1.308 +                if not entrycont.has_key('author'):
   1.309 +                    entrycont['author'] = entrycont['editor']
   1.310 +                    entrycont['author']['text'] += ', editors'
   1.311 +            elif entrytype == 'article':
   1.312 +                entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
   1.313 +            elif entrytype in ('inproceedings', 'incollection', 'conference'):
   1.314 +                entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
   1.315 +            elif entrytype == 'techreport':
   1.316 +                if not entrycont.has_key('type'):
   1.317 +                    entrycont['type'] = 'Technical report'
   1.318 +            elif entrytype == 'mastersthesis':
   1.319 +                entrycont['type'] = 'Master\'s thesis'
   1.320 +            elif entrytype == 'phdthesis':
   1.321 +                entrycont['type'] = 'PhD thesis'
   1.322 +
   1.323 +            for eline in entrycont:
   1.324 +                if eline != '':
   1.325 +                    eline = latexreplacements(eline)
   1.326 +
   1.327 +            if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   1.328 +                entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')
   1.329 +
   1.330 +            if entrycont.has_key('author') and (entrycont['author'] != ''):
   1.331 +                entry.append(entrycont['author']['text'] + '.')
   1.332 +            if entrycont.has_key('title') and (entrycont['title'] != ''):
   1.333 +                entry.append(entrycont['title'] + '.')
   1.334 +            if entrycont.has_key('journal') and (entrycont['journal'] != ''):
   1.335 +                entry.append(entrycont['journal'] + ',')
   1.336 +            if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
   1.337 +                entry.append('In ' + entrycont['booktitle'] + ',')
   1.338 +            if entrycont.has_key('type') and (entrycont['type'] != ''):
   1.339 +                eline = entrycont['type']
   1.340 +                if entrycont.has_key('number') and (entrycont['number'] != ''):
   1.341 +                    eline += ' ' + entrycont['number']
   1.342 +                eline += ','
   1.343 +                entry.append(eline)
   1.344 +            if entrycont.has_key('institution') and (entrycont['institution'] != ''):
   1.345 +                entry.append(entrycont['institution'] + ',')
   1.346 +            if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
   1.347 +                entry.append(entrycont['publisher'] + ',')
   1.348 +            if entrycont.has_key('school') and (entrycont['school'] != ''):
   1.349 +                entry.append(entrycont['school'] + ',')
   1.350 +            if entrycont.has_key('address') and (entrycont['address'] != ''):
   1.351 +                entry.append(entrycont['address'] + ',')
   1.352 +            if entrycont.has_key('edition') and (entrycont['edition'] != ''):
   1.353 +                entry.append(entrycont['edition'] + ' edition,')
   1.354 +            if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
   1.355 +                entry.append(entrycont['howpublished'] + ',')
   1.356 +            if entrycont.has_key('volume') and (entrycont['volume'] != ''):
   1.357 +                eline = entrycont['volume'];
   1.358 +                if entrycont.has_key('number') and (entrycont['number'] != ''):
   1.359 +                    eline += '(' + entrycont['number'] + ')'
   1.360 +                if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   1.361 +                    eline += ':' + entrycont['pages']
   1.362 +                eline += ','
   1.363 +                entry.append(eline)
   1.364 +            else:
   1.365 +                if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   1.366 +                    entry.append('pages ' + entrycont['pages'] + ',')
   1.367 +            if entrycont.has_key('year') and (entrycont['year'] != ''):
   1.368 +                if entrycont.has_key('month') and (entrycont['month'] != ''):
   1.369 +                    entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
   1.370 +                else:
   1.371 +                    entry.append(entrycont['year'] + '.')
   1.372 +            if entrycont.has_key('note') and (entrycont['note'] != ''):
   1.373 +                entry.append(entrycont['note'] + '.')
   1.374 +            if entrycont.has_key('url') and (entrycont['url'] != ''):
   1.375 +                entry.append(entrycont['url'] + '.')
   1.376 +
   1.377 +            # generate keys for sorting and for the output
   1.378 +            sortkey = ''
   1.379 +            bibkey = ''
   1.380 +            if entrycont.has_key('author'):
   1.381 +                for author in entrycont['author']['list']:
   1.382 +                    sortkey += copychars(author, author.rfind(' ')+1, len(author))
   1.383 +                bibkey = entrycont['author']['abbrev']
   1.384 +            else:
   1.385 +                bibkey = 'x'
   1.386 +            if entrycont.has_key('year'):
   1.387 +                sortkey += entrycont['year']
   1.388 +                bibkey += entrycont['year'][-2:]
   1.389 +            if entrycont.has_key('title'):
   1.390 +                sortkey += entrycont['title']
   1.391 +            if entrycont.has_key('key'):
   1.392 +                sortkey = entrycont['key'] + sortkey
   1.393 +                bibkey = entrycont['key']
   1.394 +            entry.insert(0, sortkey)
   1.395 +            entry.insert(1, bibkey)
   1.396 +            entry.insert(2, entryid)
   1.397 +           
   1.398 +            # add the entry to the file contents
   1.399 +            filecont.append(entry)
   1.400 +
   1.401 +        else:
   1.402 +            # field, publication info
   1.403 +            field = ''
   1.404 +            data = ''
   1.405 +            
   1.406 +            # field = {data} entries
   1.407 +            if bracedata_rex.match(line):
   1.408 +                field = bracefield_rex.sub('\g<1>', line)
   1.409 +                field = string.lower(field)
   1.410 +                data =  bracedata_rex.sub('\g<2>', line)
   1.411 +
   1.412 +            # field = "data" entries
   1.413 +            elif quotedata_rex.match(line):
   1.414 +                field = quotefield_rex.sub('\g<1>', line)
   1.415 +                field = string.lower(field)
   1.416 +                data =  quotedata_rex.sub('\g<2>', line)
   1.417 +
   1.418 +            # field = data entries
   1.419 +            elif data_rex.match(line):
   1.420 +                field = field_rex.sub('\g<1>', line)
   1.421 +                field = string.lower(field)
   1.422 +                data =  data_rex.sub('\g<2>', line)
   1.423 +
   1.424 +            if field == 'url':
   1.425 +                data = '\\url{' + data.strip() + '}'
   1.426 +            
   1.427 +            if field in ('author', 'editor'):
   1.428 +                entrycont[field] = bibtexauthor(data)
   1.429 +                line = ''
   1.430 +            elif field == 'title':
   1.431 +                line = bibtextitle(data, entrytype)
   1.432 +            elif field != '':
   1.433 +                line = removebraces(transformurls(data.strip()))
   1.434 +
   1.435 +            if line != '':
   1.436 +                line = latexreplacements(line)
   1.437 +                entrycont[field] = line
   1.438 +
   1.439 +
   1.440 +    # sort entries
   1.441 +    filecont.sort(entry_cmp)
   1.442 +    
   1.443 +    # count the bibtex keys
   1.444 +    keytable = {}
   1.445 +    counttable = {}
   1.446 +    for entry in filecont:
   1.447 +        bibkey = entry[1]
   1.448 +        if not keytable.has_key(bibkey):
   1.449 +            keytable[bibkey] = 1
   1.450 +        else:
   1.451 +            keytable[bibkey] += 1
   1.452 +
   1.453 +    for bibkey in keytable.keys():
   1.454 +        counttable[bibkey] = 0
   1.455 +    
   1.456 +    # generate output
   1.457 +    for entry in filecont:
   1.458 +        # generate output key form the bibtex key
   1.459 +        bibkey = entry[1]
   1.460 +        entryid = entry[2]
   1.461 +        if keytable[bibkey] == 1:
   1.462 +            outkey = bibkey
   1.463 +        else:
   1.464 +            outkey = bibkey + chr(97 + counttable[bibkey])
   1.465 +        counttable[bibkey] += 1
   1.466 +        
   1.467 +        # append the entry code to the output
   1.468 +        file.append('\\section ' + entryid + ' [' + outkey + ']')
   1.469 +        file.append('<div style="' + divstyle + '">')
   1.470 +        for line in entry[3:]:
   1.471 +            file.append(line)
   1.472 +        file.append('</div>')
   1.473 +        file.append('')
   1.474 +
   1.475 +    return file
   1.476 +
   1.477 +
   1.478 +#
   1.479 +# return 1 iff abbr is in line but not inside braces or quotes
   1.480 +# assumes that abbr appears only once on the line (out of braces and quotes)
   1.481 +#
   1.482 +def verify_out_of_braces(line, abbr):
   1.483 +
   1.484 +    phrase_split = delimiter_rex.split(line)
   1.485 +
   1.486 +    abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)
   1.487 +
   1.488 +    open_brace = 0
   1.489 +    open_quote = 0
   1.490 +
   1.491 +    for phrase in phrase_split:
   1.492 +        if phrase == "{":
   1.493 +            open_brace = open_brace + 1
   1.494 +        elif phrase == "}":
   1.495 +            open_brace = open_brace - 1
   1.496 +        elif phrase == '"':
   1.497 +            if open_quote == 1:
   1.498 +                open_quote = 0
   1.499 +            else:
   1.500 +                open_quote = 1
   1.501 +        elif abbr_rex.search(phrase):
   1.502 +            if open_brace == 0 and open_quote == 0:
   1.503 +                return 1
   1.504 +
   1.505 +    return 0
   1.506 +
   1.507 +
   1.508 +#
   1.509 +# a line in the form phrase1 # phrase2 # ... # phrasen
   1.510 +# is returned as phrase1 phrase2 ... phrasen
   1.511 +# with the correct punctuation
   1.512 +# Bug: Doesn't always work with multiple abbreviations plugged in
   1.513 +#
   1.514 +def concat_line(line):
   1.515 +    # only look at part after equals
   1.516 +    field = field_rex.sub('\g<1>',line)
   1.517 +    rest = field_rex.sub('\g<2>',line)
   1.518 +
   1.519 +    concat_line = field + ' ='
   1.520 +
   1.521 +    pound_split = concatsplit_rex.split(rest)
   1.522 +
   1.523 +    phrase_count = 0
   1.524 +    length = len(pound_split)
   1.525 +
   1.526 +    for phrase in pound_split:
   1.527 +        phrase = phrase.strip()
   1.528 +        if phrase_count != 0:
   1.529 +            if phrase.startswith('"') or phrase.startswith('{'):
   1.530 +                phrase = phrase[1:]
   1.531 +        elif phrase.startswith('"'):
   1.532 +            phrase = phrase.replace('"','{',1)
   1.533 +
   1.534 +        if phrase_count != length-1:
   1.535 +            if phrase.endswith('"') or phrase.endswith('}'):
   1.536 +                phrase = phrase[:-1]
   1.537 +        else:
   1.538 +            if phrase.endswith('"'):
   1.539 +                phrase = phrase[:-1]
   1.540 +                phrase = phrase + "}"
   1.541 +            elif phrase.endswith('",'):
   1.542 +                phrase = phrase[:-2]
   1.543 +                phrase = phrase + "},"
   1.544 +
   1.545 +        # if phrase did have \#, add the \# back
   1.546 +        if phrase.endswith('\\'):
   1.547 +            phrase = phrase + "#"
   1.548 +        concat_line = concat_line + ' ' + phrase
   1.549 +
   1.550 +        phrase_count = phrase_count + 1
   1.551 +
   1.552 +    return concat_line
   1.553 +
   1.554 +
   1.555 +#
   1.556 +# substitute abbreviations into filecont
   1.557 +# @param filecont_source - string of data from file
   1.558 +#
   1.559 +def bibtex_replace_abbreviations(filecont_source):
   1.560 +    filecont = filecont_source.splitlines()
   1.561 +
   1.562 +    #  These are defined in bibtex, so we'll define them too
   1.563 +    abbr_list = ['jan','feb','mar','apr','may','jun',
   1.564 +                 'jul','aug','sep','oct','nov','dec']
   1.565 +    value_list = ['January','February','March','April',
   1.566 +                  'May','June','July','August','September',
   1.567 +                  'October','November','December']
   1.568 +
   1.569 +    abbr_rex = []
   1.570 +    total_abbr_count = 0
   1.571 +
   1.572 +    front = '\\b'
   1.573 +    back = '(,?)\\b'
   1.574 +
   1.575 +    for x in abbr_list:
   1.576 +        abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
   1.577 +        total_abbr_count = total_abbr_count + 1
   1.578 +
   1.579 +
   1.580 +    abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
   1.581 +                             re.I)
   1.582 +
   1.583 +    comment_rex = re.compile('@comment\s*{',re.I)
   1.584 +    preamble_rex = re.compile('@preamble\s*{',re.I)
   1.585 +
   1.586 +    waiting_for_end_string = 0
   1.587 +    i = 0
   1.588 +    filecont2 = ''
   1.589 +
   1.590 +    for line in filecont:
   1.591 +        if line == ' ' or line == '':
   1.592 +            continue
   1.593 +
   1.594 +        if waiting_for_end_string:
   1.595 +            if re.search('}',line):
   1.596 +                waiting_for_end_string = 0
   1.597 +                continue
   1.598 +
   1.599 +        if abbrdef_rex.search(line):
   1.600 +            abbr = abbrdef_rex.sub('\g<1>', line)
   1.601 +
   1.602 +            if abbr_list.count(abbr) == 0:
   1.603 +                val = abbrdef_rex.sub('\g<2>', line)
   1.604 +                abbr_list.append(abbr)
   1.605 +                value_list.append(string.strip(val))
   1.606 +                abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
   1.607 +                total_abbr_count = total_abbr_count + 1
   1.608 +            waiting_for_end_string = 1
   1.609 +            continue
   1.610 +
   1.611 +        if comment_rex.search(line):
   1.612 +            waiting_for_end_string = 1
   1.613 +            continue
   1.614 +
   1.615 +        if preamble_rex.search(line):
   1.616 +            waiting_for_end_string = 1
   1.617 +            continue
   1.618 +
   1.619 +
   1.620 +        # replace subsequent abbreviations with the value
   1.621 +        abbr_count = 0
   1.622 +
   1.623 +        for x in abbr_list:
   1.624 +
   1.625 +            if abbr_rex[abbr_count].search(line):
   1.626 +                if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:
   1.627 +                    line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)
   1.628 +                # Check for # concatenations
   1.629 +                if concatsplit_rex.search(line):
   1.630 +                    line = concat_line(line)
   1.631 +            abbr_count = abbr_count + 1
   1.632 +
   1.633 +
   1.634 +        filecont2 = filecont2 + line + '\n'
   1.635 +        i = i+1
   1.636 +
   1.637 +
   1.638 +    # Do one final pass over file
   1.639 +
   1.640 +    # make sure that didn't end up with {" or }" after the substitution
   1.641 +    filecont2 = filecont2.replace('{"','{{')
   1.642 +    filecont2 = filecont2.replace('"}','}}')
   1.643 +
   1.644 +    afterquotevalue_rex = re.compile('"\s*,\s*')
   1.645 +    afterbrace_rex = re.compile('"\s*}')
   1.646 +    afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')
   1.647 +
   1.648 +    # add new lines to data that changed because of abbreviation substitutions
   1.649 +    filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
   1.650 +    filecont2 = afterbrace_rex.sub('"\n}', filecont2)
   1.651 +    filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)
   1.652 +
   1.653 +    return filecont2
   1.654 +
   1.655 +#
   1.656 +# convert @type( ... ) to @type{ ... }
   1.657 +#
   1.658 +def no_outer_parens(filecont):
   1.659 +
   1.660 +    # do checking for open parens
   1.661 +    # will convert to braces
   1.662 +    paren_split = re.split('([(){}])',filecont)
   1.663 +
   1.664 +    open_paren_count = 0
   1.665 +    open_type = 0
   1.666 +    look_next = 0
   1.667 +
   1.668 +    # rebuild filecont
   1.669 +    filecont = ''
   1.670 +
   1.671 +    at_rex = re.compile('@\w*')
   1.672 +
   1.673 +    for phrase in paren_split:
   1.674 +        if look_next == 1:
   1.675 +            if phrase == '(':
   1.676 +                phrase = '{'
   1.677 +                open_paren_count = open_paren_count + 1
   1.678 +            else:
   1.679 +                open_type = 0
   1.680 +            look_next = 0
   1.681 +
   1.682 +        if phrase == '(':
   1.683 +            open_paren_count = open_paren_count + 1
   1.684 +
   1.685 +        elif phrase == ')':
   1.686 +            open_paren_count = open_paren_count - 1
   1.687 +            if open_type == 1 and open_paren_count == 0:
   1.688 +                phrase = '}'
   1.689 +                open_type = 0
   1.690 +
   1.691 +        elif at_rex.search( phrase ):
   1.692 +            open_type = 1
   1.693 +            look_next = 1
   1.694 +
   1.695 +        filecont = filecont + phrase
   1.696 +
   1.697 +    return filecont
   1.698 +
   1.699 +
   1.700 +#
   1.701 +# make all whitespace into just one space
   1.702 +# format the bibtex file into a usable form.
   1.703 +#
   1.704 +def bibtexwasher(filecont_source):
   1.705 +
   1.706 +    space_rex = re.compile('\s+')
   1.707 +    comment_rex = re.compile('\s*%')
   1.708 +
   1.709 +    filecont = []
   1.710 +
   1.711 +    # remove trailing and excessive whitespace
   1.712 +    # ignore comments
   1.713 +    for line in filecont_source:
   1.714 +        line = string.strip(line)
   1.715 +        line = space_rex.sub(' ', line)
   1.716 +        # ignore comments
   1.717 +        if not comment_rex.match(line) and line != '':
   1.718 +            filecont.append(' '+ line)
   1.719 +
   1.720 +    filecont = string.join(filecont, '')
   1.721 +
   1.722 +    # the file is in one long string
   1.723 +
   1.724 +    filecont = no_outer_parens(filecont)
   1.725 +
   1.726 +    #
   1.727 +    # split lines according to preferred syntax scheme
   1.728 +    #
   1.729 +    filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)
   1.730 +
   1.731 +    # add new lines after commas that are after values
   1.732 +    filecont = re.sub('"\s*,', '",\n', filecont)
   1.733 +    filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
   1.734 +    filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',
   1.735 +                          '\n\n\g<1>\g<2>,\n', filecont)
   1.736 +
   1.737 +    # add new lines after }
   1.738 +    filecont = re.sub('"\s*}','"\n}\n', filecont)
   1.739 +    filecont = re.sub('}\s*,','},\n', filecont)
   1.740 +
   1.741 +
   1.742 +    filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)
   1.743 +
   1.744 +    # character encoding, reserved latex characters
   1.745 +    filecont = re.sub('{\\\&}', '&', filecont)
   1.746 +    filecont = re.sub('\\\&', '&', filecont)
   1.747 +
   1.748 +    # do checking for open braces to get format correct
   1.749 +    open_brace_count = 0
   1.750 +    brace_split = re.split('([{}])',filecont)
   1.751 +
   1.752 +    # rebuild filecont
   1.753 +    filecont = ''
   1.754 +
   1.755 +    for phrase in brace_split:
   1.756 +        if phrase == '{':
   1.757 +            open_brace_count = open_brace_count + 1
   1.758 +        elif phrase == '}':
   1.759 +            open_brace_count = open_brace_count - 1
   1.760 +            if open_brace_count == 0:
   1.761 +                filecont = filecont + '\n'
   1.762 +
   1.763 +        filecont = filecont + phrase
   1.764 +
   1.765 +    filecont2 = bibtex_replace_abbreviations(filecont)
   1.766 +
   1.767 +    # gather
   1.768 +    filecont = filecont2.splitlines()
   1.769 +    i=0
   1.770 +    j=0         # count the number of blank lines
   1.771 +    for line in filecont:
   1.772 +        # ignore blank lines
   1.773 +        if line == '' or line == ' ':
   1.774 +            j = j+1
   1.775 +            continue
   1.776 +        filecont[i] = line + '\n'
   1.777 +        i = i+1
   1.778 +
   1.779 +    # get rid of the extra stuff at the end of the array
   1.780 +    # (The extra stuff are duplicates that are in the array because
   1.781 +    # blank lines were removed.)
   1.782 +    length = len( filecont)
   1.783 +    filecont[length-j:length] = []
   1.784 +
   1.785 +    return filecont
   1.786 +
   1.787 +
   1.788 +def filehandler(filepath):
   1.789 +    try:
   1.790 +        fd = open(filepath, 'r')
   1.791 +        filecont_source = fd.readlines()
   1.792 +        fd.close()
   1.793 +    except:
   1.794 +        print 'Could not open file:', filepath
   1.795 +    washeddata = bibtexwasher(filecont_source)
   1.796 +    outdata = bibtexdecoder(washeddata)
   1.797 +    print '/**'
   1.798 +    print '\page references References'
   1.799 +    print
   1.800 +    for line in outdata:
   1.801 +        print line
   1.802 +    print '*/'
   1.803 +
   1.804 +
   1.805 +# main program
   1.806 +
   1.807 +def main():
   1.808 +    import sys
   1.809 +    if sys.argv[1:]:
   1.810 +        filepath = sys.argv[1]
   1.811 +    else:
   1.812 +        print "No input file"
   1.813 +        sys.exit()
   1.814 +    filehandler(filepath)
   1.815 +
   1.816 +if __name__ == "__main__": main()
   1.817 +
   1.818 +
   1.819 +# end python script
changeset 1284	ad40f7d32846
parent 801	2de0fc630899
child 1220	eb2f9d453070