1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/scripts/bib2dox.py	Thu Nov 05 15:48:01 2009 +0100
     1.3 @@ -0,0 +1,811 @@
     1.4 +#!/usr/bin/env /usr/local/Python/bin/python2.1
     1.5 +"""
     1.6 +  BibTeX to Doxygen converter
     1.7 +  Usage: python bib2dox.py bibfile.bib > bibfile.dox
     1.8 +
     1.9 +  This code is a modification of the BibTeX to XML converter
    1.10 +  by Vidar Bronken Gundersen et al. See the original copyright notices below.
    1.11 +
    1.12 +  **********************************************************************
    1.13 +
    1.14 +  Decoder for bibliographic data, BibTeX
    1.15 +  Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
    1.16 +
    1.17 +  v.8
    1.18 +  (c)2002-06-23 Vidar Bronken Gundersen
    1.19 +  http://bibtexml.sf.net/
    1.20 +  Reuse approved as long as this notification is kept.
    1.21 +  Licence: GPL.
    1.22 +
    1.23 +  Contributions/thanks to:
    1.24 +  Egon Willighagen, http://sf.net/projects/jreferences/
    1.25 +  Richard Mahoney (for providing a test case)
    1.26 +
    1.27 +  Edited by Sara Sprenkle to be more robust and handle more bibtex features.
    1.28 +  (c) 2003-01-15
    1.29 +
    1.30 +  1.  Changed bibtex: tags to bibxml: tags.
    1.31 +  2.  Use xmlns:bibxml="http://bibtexml.sf.net/"
    1.32 +  3.  Allow spaces between @type and first {
    1.33 +  4.  "author" fields with multiple authors split by " and "
    1.34 +      are put in separate xml "bibxml:author" tags.
    1.35 +  5.  Option for Titles: words are capitalized
    1.36 +      only if first letter in title or capitalized inside braces
    1.37 +  6.  Removes braces from within field values
    1.38 +  7.  Ignores comments in bibtex file (including @comment{ or % )
    1.39 +  8.  Replaces some special latex tags, e.g., replaces ~ with '&#160;'
    1.40 +  9.  Handles bibtex @string abbreviations
    1.41 +        --> includes bibtex's default abbreviations for months
    1.42 +        --> does concatenation of abbr # " more " and " more " # abbr
    1.43 +  10. Handles @type( ... ) or @type{ ... }
    1.44 +  11. The keywords field is split on , or ; and put into separate xml
    1.45 +      "bibxml:keywords" tags
    1.46 +  12. Ignores @preamble
    1.47 +
    1.48 +  Known Limitations
    1.49 +  1.  Does not transform Latex encoding like math mode and special
    1.50 +      latex symbols.
    1.51 +  2.  Does not parse author fields into first and last names.
    1.52 +      E.g., It does not do anything special to an author whose name is
    1.53 +      in the form LAST_NAME, FIRST_NAME
    1.54 +      In "author" tag, will show up as
    1.55 +      <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
    1.56 +  3.  Does not handle "crossref" fields other than to print
    1.57 +      <bibxml:crossref>...</bibxml:crossref>
    1.58 +  4.  Does not inform user of the input's format errors.  You just won't
    1.59 +      be able to transform the file later with XSL
    1.60 +
    1.61 +  You will have to manually edit the XML output if you need to handle
    1.62 +  these (and unknown) limitations.
    1.63 +
    1.64 +"""
    1.65 +
    1.66 +import string, re
    1.67 +
    1.68 +# set of valid name characters
    1.69 +valid_name_chars = '[\w\-:]'
    1.70 +
    1.71 +#
    1.72 +# define global regular expression variables
    1.73 +#
    1.74 +author_rex = re.compile('\s+and\s+')
    1.75 +rembraces_rex = re.compile('[{}]')
    1.76 +capitalize_rex = re.compile('({[^}]*})')
    1.77 +
    1.78 +# used by bibtexkeywords(data)
    1.79 +keywords_rex = re.compile('[,;]')
    1.80 +
    1.81 +# used by concat_line(line)
    1.82 +concatsplit_rex = re.compile('\s*#\s*')
    1.83 +
    1.84 +# split on {, }, or " in verify_out_of_braces
    1.85 +delimiter_rex = re.compile('([{}"])',re.I)
    1.86 +
    1.87 +field_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
    1.88 +data_rex = re.compile('\s*(\w*)\s*=\s*([^,]*),?')
    1.89 +
    1.90 +url_rex = re.compile('\\\url\{([^}]*)\}')
    1.91 +
    1.92 +#
    1.93 +# styles for html formatting
    1.94 +#
    1.95 +divstyle = 'margin-top: -4ex; margin-left: 8em;'
    1.96 +
    1.97 +#
    1.98 +# return the string parameter without braces
    1.99 +#
   1.100 +def transformurls(str):
   1.101 +    return url_rex.sub(r'<a href="\1">\1</a>', str)
   1.102 +
   1.103 +#
   1.104 +# return the string parameter without braces
   1.105 +#
   1.106 +def removebraces(str):
   1.107 +    return rembraces_rex.sub('', str)
   1.108 +
   1.109 +#
   1.110 +# latex-specific replacements
   1.111 +# (do this after braces were removed)
   1.112 +#
   1.113 +def latexreplacements(line):
   1.114 +    line = string.replace(line, '~', '&nbsp;')
   1.115 +    line = string.replace(line, '\\\'a', '&aacute;')
   1.116 +    line = string.replace(line, '\\"a', '&auml;')
   1.117 +    line = string.replace(line, '\\\'e', '&eacute;')
   1.118 +    line = string.replace(line, '\\"e', '&euml;')
   1.119 +    line = string.replace(line, '\\\'i', '&iacute;')
   1.120 +    line = string.replace(line, '\\"i', '&iuml;')
   1.121 +    line = string.replace(line, '\\\'o', '&oacute;')
   1.122 +    line = string.replace(line, '\\"o', '&ouml;')
   1.123 +    line = string.replace(line, '\\\'u', '&uacute;')
   1.124 +    line = string.replace(line, '\\"u', '&uuml;')
   1.125 +    line = string.replace(line, '\\H o', '&otilde;')
   1.126 +    line = string.replace(line, '\\H u', '&uuml;')   # &utilde; does not exist
   1.127 +    line = string.replace(line, '\\\'A', '&Aacute;')
   1.128 +    line = string.replace(line, '\\"A', '&Auml;')
   1.129 +    line = string.replace(line, '\\\'E', '&Eacute;')
   1.130 +    line = string.replace(line, '\\"E', '&Euml;')
   1.131 +    line = string.replace(line, '\\\'I', '&Iacute;')
   1.132 +    line = string.replace(line, '\\"I', '&Iuml;')
   1.133 +    line = string.replace(line, '\\\'O', '&Oacute;')
   1.134 +    line = string.replace(line, '\\"O', '&Ouml;')
   1.135 +    line = string.replace(line, '\\\'U', '&Uacute;')
   1.136 +    line = string.replace(line, '\\"U', '&Uuml;')
   1.137 +    line = string.replace(line, '\\H O', '&Otilde;')
   1.138 +    line = string.replace(line, '\\H U', '&Uuml;')   # &Utilde; does not exist
   1.139 +
   1.140 +    return line
   1.141 +
   1.142 +#
   1.143 +# copy characters form a string decoding html expressions (&xyz;)
   1.144 +#
   1.145 +def copychars(str, ifrom, count):
   1.146 +    result = ''
   1.147 +    i = ifrom
   1.148 +    c = 0
   1.149 +    html_spec = False
   1.150 +    while (i < len(str)) and (c < count):
   1.151 +        if str[i] == '&':
   1.152 +            html_spec = True;
   1.153 +            if i+1 < len(str):
   1.154 +                result += str[i+1]
   1.155 +            c += 1
   1.156 +            i += 2
   1.157 +        else:
   1.158 +            if not html_spec:
   1.159 +                if ((str[i] >= 'A') and (str[i] <= 'Z')) or \
   1.160 +                   ((str[i] >= 'a') and (str[i] <= 'z')):
   1.161 +                    result += str[i]
   1.162 +                    c += 1
   1.163 +            elif str[i] == ';':
   1.164 +                html_spec = False;
   1.165 +            i += 1
   1.166 +    
   1.167 +    return result
   1.168 +
   1.169 +
   1.170 +# 
   1.171 +# Handle a list of authors (separated by 'and').
   1.172 +# It gives back an array of the follwing values:
   1.173 +#  - num: the number of authors,
   1.174 +#  - list: the list of the author names,
   1.175 +#  - text: the bibtex text (separated by commas and/or 'and')
   1.176 +#  - abbrev: abbreviation that can be used for indicate the
   1.177 +#    bibliography entries
   1.178 +#
   1.179 +def bibtexauthor(data):
   1.180 +    result = {}
   1.181 +    bibtex = ''
   1.182 +    result['list'] = author_rex.split(data)
   1.183 +    result['num'] = len(result['list'])
   1.184 +    for i, author in enumerate(result['list']):
   1.185 +        # general transformations
   1.186 +        author = latexreplacements(removebraces(author.strip()))
   1.187 +        # transform "Xyz, A. B." to "A. B. Xyz"
   1.188 +        pos = author.find(',')
   1.189 +        if pos != -1:
   1.190 +            author = author[pos+1:].strip() + ' ' + author[:pos].strip()
   1.191 +        result['list'][i] = author
   1.192 +        bibtex += author + '#'
   1.193 +    bibtex = bibtex[:-1]
   1.194 +    if result['num'] > 1:
   1.195 +        ix = bibtex.rfind('#')
   1.196 +        if result['num'] == 2:
   1.197 +            bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
   1.198 +        else:
   1.199 +            bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
   1.200 +    bibtex = bibtex.replace('#', ', ')
   1.201 +    result['text'] = bibtex
   1.202 +    
   1.203 +    result['abbrev'] = ''
   1.204 +    for author in result['list']:
   1.205 +        pos = author.rfind(' ') + 1
   1.206 +        count = 1
   1.207 +        if result['num'] == 1:
   1.208 +            count = 3
   1.209 +        result['abbrev'] += copychars(author, pos, count)
   1.210 +
   1.211 +    return result
   1.212 +
   1.213 +
   1.214 +#
   1.215 +# data = title string
   1.216 +# @return the capitalized title (first letter is capitalized), rest are capitalized
   1.217 +# only if capitalized inside braces
   1.218 +#
   1.219 +def capitalizetitle(data):
   1.220 +    title_list = capitalize_rex.split(data)
   1.221 +    title = ''
   1.222 +    count = 0
   1.223 +    for phrase in title_list:
   1.224 +         check = string.lstrip(phrase)
   1.225 +
   1.226 +         # keep phrase's capitalization the same
   1.227 +         if check.find('{') == 0:
   1.228 +              title += removebraces(phrase)
   1.229 +         else:
   1.230 +         # first word --> capitalize first letter (after spaces)
   1.231 +              if count == 0:
   1.232 +                  title += check.capitalize()
   1.233 +              else:
   1.234 +                  title += phrase.lower()
   1.235 +         count = count + 1
   1.236 +
   1.237 +    return title
   1.238 +
   1.239 +
   1.240 +#
   1.241 +# @return the bibtex for the title
   1.242 +# @param data --> title string
   1.243 +# braces are removed from title
   1.244 +#
   1.245 +def bibtextitle(data, entrytype):
   1.246 +    if entrytype in ('book', 'inbook'):
   1.247 +        title = removebraces(data.strip())
   1.248 +    else:
   1.249 +        title = removebraces(capitalizetitle(data.strip()))
   1.250 +    bibtex = title
   1.251 +    return bibtex
   1.252 +
   1.253 +
   1.254 +#
   1.255 +# function to compare entry lists
   1.256 +#
   1.257 +def entry_cmp(x, y):
   1.258 +    return cmp(x[0], y[0])
   1.259 +
   1.260 +
   1.261 +#
   1.262 +# print the XML for the transformed "filecont_source"
   1.263 +#
   1.264 +def bibtexdecoder(filecont_source):
   1.265 +    filecont = []
   1.266 +    file = []
   1.267 +    
   1.268 +    # want @<alphanumeric chars><spaces>{<spaces><any chars>,
   1.269 +    pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')
   1.270 +    endtype_rex = re.compile('}\s*$')
   1.271 +    endtag_rex = re.compile('^\s*}\s*$')
   1.272 +
   1.273 +    bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
   1.274 +    bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')
   1.275 +
   1.276 +    quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
   1.277 +    quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')
   1.278 +
   1.279 +    for line in filecont_source:
   1.280 +        line = line[:-1]
   1.281 +
   1.282 +        # encode character entities
   1.283 +        line = string.replace(line, '&', '&amp;')
   1.284 +        line = string.replace(line, '<', '&lt;')
   1.285 +        line = string.replace(line, '>', '&gt;')
   1.286 +
   1.287 +        # start entry: publication type (store for later use)
   1.288 +        if pubtype_rex.match(line):
   1.289 +        # want @<alphanumeric chars><spaces>{<spaces><any chars>,
   1.290 +            entrycont = {}
   1.291 +            entry = []
   1.292 +            entrytype = pubtype_rex.sub('\g<1>',line)
   1.293 +            entrytype = string.lower(entrytype)
   1.294 +            entryid   = pubtype_rex.sub('\g<2>', line)
   1.295 +
   1.296 +        # end entry if just a }
   1.297 +        elif endtype_rex.match(line):
   1.298 +            # generate doxygen code for the entry
   1.299 +
   1.300 +            # enty type related formattings
   1.301 +            if entrytype in ('book', 'inbook'):
   1.302 +                entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
   1.303 +                if not entrycont.has_key('author'):
   1.304 +                    entrycont['author'] = entrycont['editor']
   1.305 +                    entrycont['author']['text'] += ', editors'
   1.306 +            elif entrytype == 'article':
   1.307 +                entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
   1.308 +            elif entrytype in ('inproceedings', 'incollection', 'conference'):
   1.309 +                entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
   1.310 +            elif entrytype == 'techreport':
   1.311 +                if not entrycont.has_key('type'):
   1.312 +                    entrycont['type'] = 'Technical report'
   1.313 +            elif entrytype == 'mastersthesis':
   1.314 +                entrycont['type'] = 'Master\'s thesis'
   1.315 +            elif entrytype == 'phdthesis':
   1.316 +                entrycont['type'] = 'PhD thesis'
   1.317 +
   1.318 +            for eline in entrycont:
   1.319 +                if eline != '':
   1.320 +                    eline = latexreplacements(eline)
   1.321 +
   1.322 +            if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   1.323 +                entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')
   1.324 +
   1.325 +            if entrycont.has_key('author') and (entrycont['author'] != ''):
   1.326 +                entry.append(entrycont['author']['text'] + '.')
   1.327 +            if entrycont.has_key('title') and (entrycont['title'] != ''):
   1.328 +                entry.append(entrycont['title'] + '.')
   1.329 +            if entrycont.has_key('journal') and (entrycont['journal'] != ''):
   1.330 +                entry.append(entrycont['journal'] + ',')
   1.331 +            if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
   1.332 +                entry.append('In ' + entrycont['booktitle'] + ',')
   1.333 +            if entrycont.has_key('type') and (entrycont['type'] != ''):
   1.334 +                eline = entrycont['type']
   1.335 +                if entrycont.has_key('number') and (entrycont['number'] != ''):
   1.336 +                    eline += ' ' + entrycont['number']
   1.337 +                eline += ','
   1.338 +                entry.append(eline)
   1.339 +            if entrycont.has_key('institution') and (entrycont['institution'] != ''):
   1.340 +                entry.append(entrycont['institution'] + ',')
   1.341 +            if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
   1.342 +                entry.append(entrycont['publisher'] + ',')
   1.343 +            if entrycont.has_key('school') and (entrycont['school'] != ''):
   1.344 +                entry.append(entrycont['school'] + ',')
   1.345 +            if entrycont.has_key('address') and (entrycont['address'] != ''):
   1.346 +                entry.append(entrycont['address'] + ',')
   1.347 +            if entrycont.has_key('edition') and (entrycont['edition'] != ''):
   1.348 +                entry.append(entrycont['edition'] + ' edition,')
   1.349 +            if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
   1.350 +                entry.append(entrycont['howpublished'] + ',')
   1.351 +            if entrycont.has_key('volume') and (entrycont['volume'] != ''):
   1.352 +                eline = entrycont['volume'];
   1.353 +                if entrycont.has_key('number') and (entrycont['number'] != ''):
   1.354 +                    eline += '(' + entrycont['number'] + ')'
   1.355 +                if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   1.356 +                    eline += ':' + entrycont['pages']
   1.357 +                eline += ','
   1.358 +                entry.append(eline)
   1.359 +            else:
   1.360 +                if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   1.361 +                    entry.append('pages ' + entrycont['pages'] + ',')
   1.362 +            if entrycont.has_key('year') and (entrycont['year'] != ''):
   1.363 +                if entrycont.has_key('month') and (entrycont['month'] != ''):
   1.364 +                    entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
   1.365 +                else:
   1.366 +                    entry.append(entrycont['year'] + '.')
   1.367 +            if entrycont.has_key('note') and (entrycont['note'] != ''):
   1.368 +                entry.append(entrycont['note'] + '.')
   1.369 +            if entrycont.has_key('url') and (entrycont['url'] != ''):
   1.370 +                entry.append(entrycont['url'] + '.')
   1.371 +
   1.372 +            # generate keys for sorting and for the output
   1.373 +            sortkey = ''
   1.374 +            bibkey = ''
   1.375 +            if entrycont.has_key('author'):
   1.376 +                for author in entrycont['author']['list']:
   1.377 +                    sortkey += copychars(author, author.rfind(' ')+1, len(author))
   1.378 +                bibkey = entrycont['author']['abbrev']
   1.379 +            else:
   1.380 +                bibkey = 'x'
   1.381 +            if entrycont.has_key('year'):
   1.382 +                sortkey += entrycont['year']
   1.383 +                bibkey += entrycont['year'][-2:]
   1.384 +            if entrycont.has_key('title'):
   1.385 +                sortkey += entrycont['title']
   1.386 +            if entrycont.has_key('key'):
   1.387 +                sortkey = entrycont['key'] + sortkey
   1.388 +                bibkey = entrycont['key']
   1.389 +            entry.insert(0, sortkey)
   1.390 +            entry.insert(1, bibkey)
   1.391 +            entry.insert(2, entryid)
   1.392 +           
   1.393 +            # add the entry to the file contents
   1.394 +            filecont.append(entry)
   1.395 +
   1.396 +        else:
   1.397 +            # field, publication info
   1.398 +            field = ''
   1.399 +            data = ''
   1.400 +            
   1.401 +            # field = {data} entries
   1.402 +            if bracedata_rex.match(line):
   1.403 +                field = bracefield_rex.sub('\g<1>', line)
   1.404 +                field = string.lower(field)
   1.405 +                data =  bracedata_rex.sub('\g<2>', line)
   1.406 +
   1.407 +            # field = "data" entries
   1.408 +            elif quotedata_rex.match(line):
   1.409 +                field = quotefield_rex.sub('\g<1>', line)
   1.410 +                field = string.lower(field)
   1.411 +                data =  quotedata_rex.sub('\g<2>', line)
   1.412 +
   1.413 +            # field = data entries
   1.414 +            elif data_rex.match(line):
   1.415 +                field = field_rex.sub('\g<1>', line)
   1.416 +                field = string.lower(field)
   1.417 +                data =  data_rex.sub('\g<2>', line)
   1.418 +
   1.419 +            if field == 'url':
   1.420 +                data = '\\url{' + data.strip() + '}'
   1.421 +            
   1.422 +            if field in ('author', 'editor'):
   1.423 +                entrycont[field] = bibtexauthor(data)
   1.424 +                line = ''
   1.425 +            elif field == 'title':
   1.426 +                line = bibtextitle(data, entrytype)
   1.427 +            elif field != '':
   1.428 +                line = removebraces(transformurls(data.strip()))
   1.429 +
   1.430 +            if line != '':
   1.431 +                line = latexreplacements(line)
   1.432 +                entrycont[field] = line
   1.433 +
   1.434 +
   1.435 +    # sort entries
   1.436 +    filecont.sort(entry_cmp)
   1.437 +    
   1.438 +    # count the bibtex keys
   1.439 +    keytable = {}
   1.440 +    counttable = {}
   1.441 +    for entry in filecont:
   1.442 +        bibkey = entry[1]
   1.443 +        if not keytable.has_key(bibkey):
   1.444 +            keytable[bibkey] = 1
   1.445 +        else:
   1.446 +            keytable[bibkey] += 1
   1.447 +
   1.448 +    for bibkey in keytable.keys():
   1.449 +        counttable[bibkey] = 0
   1.450 +    
   1.451 +    # generate output
   1.452 +    for entry in filecont:
   1.453 +        # generate output key form the bibtex key
   1.454 +        bibkey = entry[1]
   1.455 +        entryid = entry[2]
   1.456 +        if keytable[bibkey] == 1:
   1.457 +            outkey = bibkey
   1.458 +        else:
   1.459 +            outkey = bibkey + chr(97 + counttable[bibkey])
   1.460 +        counttable[bibkey] += 1
   1.461 +        
   1.462 +        # append the entry code to the output
   1.463 +        file.append('\\section ' + entryid + ' [' + outkey + ']')
   1.464 +        file.append('<div style="' + divstyle + '">')
   1.465 +        for line in entry[3:]:
   1.466 +            file.append(line)
   1.467 +        file.append('</div>')
   1.468 +        file.append('')
   1.469 +
   1.470 +    return file
   1.471 +
   1.472 +
   1.473 +#
   1.474 +# return 1 iff abbr is in line but not inside braces or quotes
   1.475 +# assumes that abbr appears only once on the line (out of braces and quotes)
   1.476 +#
   1.477 +def verify_out_of_braces(line, abbr):
   1.478 +
   1.479 +    phrase_split = delimiter_rex.split(line)
   1.480 +
   1.481 +    abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)
   1.482 +
   1.483 +    open_brace = 0
   1.484 +    open_quote = 0
   1.485 +
   1.486 +    for phrase in phrase_split:
   1.487 +        if phrase == "{":
   1.488 +            open_brace = open_brace + 1
   1.489 +        elif phrase == "}":
   1.490 +            open_brace = open_brace - 1
   1.491 +        elif phrase == '"':
   1.492 +            if open_quote == 1:
   1.493 +                open_quote = 0
   1.494 +            else:
   1.495 +                open_quote = 1
   1.496 +        elif abbr_rex.search(phrase):
   1.497 +            if open_brace == 0 and open_quote == 0:
   1.498 +                return 1
   1.499 +
   1.500 +    return 0
   1.501 +
   1.502 +
   1.503 +#
   1.504 +# a line in the form phrase1 # phrase2 # ... # phrasen
   1.505 +# is returned as phrase1 phrase2 ... phrasen
   1.506 +# with the correct punctuation
   1.507 +# Bug: Doesn't always work with multiple abbreviations plugged in
   1.508 +#
   1.509 +def concat_line(line):
   1.510 +    # only look at part after equals
   1.511 +    field = field_rex.sub('\g<1>',line)
   1.512 +    rest = field_rex.sub('\g<2>',line)
   1.513 +
   1.514 +    concat_line = field + ' ='
   1.515 +
   1.516 +    pound_split = concatsplit_rex.split(rest)
   1.517 +
   1.518 +    phrase_count = 0
   1.519 +    length = len(pound_split)
   1.520 +
   1.521 +    for phrase in pound_split:
   1.522 +        phrase = phrase.strip()
   1.523 +        if phrase_count != 0:
   1.524 +            if phrase.startswith('"') or phrase.startswith('{'):
   1.525 +                phrase = phrase[1:]
   1.526 +        elif phrase.startswith('"'):
   1.527 +            phrase = phrase.replace('"','{',1)
   1.528 +
   1.529 +        if phrase_count != length-1:
   1.530 +            if phrase.endswith('"') or phrase.endswith('}'):
   1.531 +                phrase = phrase[:-1]
   1.532 +        else:
   1.533 +            if phrase.endswith('"'):
   1.534 +                phrase = phrase[:-1]
   1.535 +                phrase = phrase + "}"
   1.536 +            elif phrase.endswith('",'):
   1.537 +                phrase = phrase[:-2]
   1.538 +                phrase = phrase + "},"
   1.539 +
   1.540 +        # if phrase did have \#, add the \# back
   1.541 +        if phrase.endswith('\\'):
   1.542 +            phrase = phrase + "#"
   1.543 +        concat_line = concat_line + ' ' + phrase
   1.544 +
   1.545 +        phrase_count = phrase_count + 1
   1.546 +
   1.547 +    return concat_line
   1.548 +
   1.549 +
   1.550 +#
   1.551 +# substitute abbreviations into filecont
   1.552 +# @param filecont_source - string of data from file
   1.553 +#
   1.554 +def bibtex_replace_abbreviations(filecont_source):
   1.555 +    filecont = filecont_source.splitlines()
   1.556 +
   1.557 +    #  These are defined in bibtex, so we'll define them too
   1.558 +    abbr_list = ['jan','feb','mar','apr','may','jun',
   1.559 +                 'jul','aug','sep','oct','nov','dec']
   1.560 +    value_list = ['January','February','March','April',
   1.561 +                  'May','June','July','August','September',
   1.562 +                  'October','November','December']
   1.563 +
   1.564 +    abbr_rex = []
   1.565 +    total_abbr_count = 0
   1.566 +
   1.567 +    front = '\\b'
   1.568 +    back = '(,?)\\b'
   1.569 +
   1.570 +    for x in abbr_list:
   1.571 +        abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
   1.572 +        total_abbr_count = total_abbr_count + 1
   1.573 +
   1.574 +
   1.575 +    abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
   1.576 +                             re.I)
   1.577 +
   1.578 +    comment_rex = re.compile('@comment\s*{',re.I)
   1.579 +    preamble_rex = re.compile('@preamble\s*{',re.I)
   1.580 +
   1.581 +    waiting_for_end_string = 0
   1.582 +    i = 0
   1.583 +    filecont2 = ''
   1.584 +
   1.585 +    for line in filecont:
   1.586 +        if line == ' ' or line == '':
   1.587 +            continue
   1.588 +
   1.589 +        if waiting_for_end_string:
   1.590 +            if re.search('}',line):
   1.591 +                waiting_for_end_string = 0
   1.592 +                continue
   1.593 +
   1.594 +        if abbrdef_rex.search(line):
   1.595 +            abbr = abbrdef_rex.sub('\g<1>', line)
   1.596 +
   1.597 +            if abbr_list.count(abbr) == 0:
   1.598 +                val = abbrdef_rex.sub('\g<2>', line)
   1.599 +                abbr_list.append(abbr)
   1.600 +                value_list.append(string.strip(val))
   1.601 +                abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
   1.602 +                total_abbr_count = total_abbr_count + 1
   1.603 +            waiting_for_end_string = 1
   1.604 +            continue
   1.605 +
   1.606 +        if comment_rex.search(line):
   1.607 +            waiting_for_end_string = 1
   1.608 +            continue
   1.609 +
   1.610 +        if preamble_rex.search(line):
   1.611 +            waiting_for_end_string = 1
   1.612 +            continue
   1.613 +
   1.614 +
   1.615 +        # replace subsequent abbreviations with the value
   1.616 +        abbr_count = 0
   1.617 +
   1.618 +        for x in abbr_list:
   1.619 +
   1.620 +            if abbr_rex[abbr_count].search(line):
   1.621 +                if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:
   1.622 +                    line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)
   1.623 +                # Check for # concatenations
   1.624 +                if concatsplit_rex.search(line):
   1.625 +                    line = concat_line(line)
   1.626 +            abbr_count = abbr_count + 1
   1.627 +
   1.628 +
   1.629 +        filecont2 = filecont2 + line + '\n'
   1.630 +        i = i+1
   1.631 +
   1.632 +
   1.633 +    # Do one final pass over file
   1.634 +
   1.635 +    # make sure that didn't end up with {" or }" after the substitution
   1.636 +    filecont2 = filecont2.replace('{"','{{')
   1.637 +    filecont2 = filecont2.replace('"}','}}')
   1.638 +
   1.639 +    afterquotevalue_rex = re.compile('"\s*,\s*')
   1.640 +    afterbrace_rex = re.compile('"\s*}')
   1.641 +    afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')
   1.642 +
   1.643 +    # add new lines to data that changed because of abbreviation substitutions
   1.644 +    filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
   1.645 +    filecont2 = afterbrace_rex.sub('"\n}', filecont2)
   1.646 +    filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)
   1.647 +
   1.648 +    return filecont2
   1.649 +
   1.650 +#
   1.651 +# convert @type( ... ) to @type{ ... }
   1.652 +#
   1.653 +def no_outer_parens(filecont):
   1.654 +
   1.655 +    # do checking for open parens
   1.656 +    # will convert to braces
   1.657 +    paren_split = re.split('([(){}])',filecont)
   1.658 +
   1.659 +    open_paren_count = 0
   1.660 +    open_type = 0
   1.661 +    look_next = 0
   1.662 +
   1.663 +    # rebuild filecont
   1.664 +    filecont = ''
   1.665 +
   1.666 +    at_rex = re.compile('@\w*')
   1.667 +
   1.668 +    for phrase in paren_split:
   1.669 +        if look_next == 1:
   1.670 +            if phrase == '(':
   1.671 +                phrase = '{'
   1.672 +                open_paren_count = open_paren_count + 1
   1.673 +            else:
   1.674 +                open_type = 0
   1.675 +            look_next = 0
   1.676 +
   1.677 +        if phrase == '(':
   1.678 +            open_paren_count = open_paren_count + 1
   1.679 +
   1.680 +        elif phrase == ')':
   1.681 +            open_paren_count = open_paren_count - 1
   1.682 +            if open_type == 1 and open_paren_count == 0:
   1.683 +                phrase = '}'
   1.684 +                open_type = 0
   1.685 +
   1.686 +        elif at_rex.search( phrase ):
   1.687 +            open_type = 1
   1.688 +            look_next = 1
   1.689 +
   1.690 +        filecont = filecont + phrase
   1.691 +
   1.692 +    return filecont
   1.693 +
   1.694 +
   1.695 +#
   1.696 +# make all whitespace into just one space
   1.697 +# format the bibtex file into a usable form.
   1.698 +#
   1.699 +def bibtexwasher(filecont_source):
   1.700 +
   1.701 +    space_rex = re.compile('\s+')
   1.702 +    comment_rex = re.compile('\s*%')
   1.703 +
   1.704 +    filecont = []
   1.705 +
   1.706 +    # remove trailing and excessive whitespace
   1.707 +    # ignore comments
   1.708 +    for line in filecont_source:
   1.709 +        line = string.strip(line)
   1.710 +        line = space_rex.sub(' ', line)
   1.711 +        # ignore comments
   1.712 +        if not comment_rex.match(line) and line != '':
   1.713 +            filecont.append(' '+ line)
   1.714 +
   1.715 +    filecont = string.join(filecont, '')
   1.716 +
   1.717 +    # the file is in one long string
   1.718 +
   1.719 +    filecont = no_outer_parens(filecont)
   1.720 +
   1.721 +    #
   1.722 +    # split lines according to preferred syntax scheme
   1.723 +    #
   1.724 +    filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)
   1.725 +
   1.726 +    # add new lines after commas that are after values
   1.727 +    filecont = re.sub('"\s*,', '",\n', filecont)
   1.728 +    filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
   1.729 +    filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',
   1.730 +                          '\n\n\g<1>\g<2>,\n', filecont)
   1.731 +
   1.732 +    # add new lines after }
   1.733 +    filecont = re.sub('"\s*}','"\n}\n', filecont)
   1.734 +    filecont = re.sub('}\s*,','},\n', filecont)
   1.735 +
   1.736 +
   1.737 +    filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)
   1.738 +
   1.739 +    # character encoding, reserved latex characters
   1.740 +    filecont = re.sub('{\\\&}', '&', filecont)
   1.741 +    filecont = re.sub('\\\&', '&', filecont)
   1.742 +
   1.743 +    # do checking for open braces to get format correct
   1.744 +    open_brace_count = 0
   1.745 +    brace_split = re.split('([{}])',filecont)
   1.746 +
   1.747 +    # rebuild filecont
   1.748 +    filecont = ''
   1.749 +
   1.750 +    for phrase in brace_split:
   1.751 +        if phrase == '{':
   1.752 +            open_brace_count = open_brace_count + 1
   1.753 +        elif phrase == '}':
   1.754 +            open_brace_count = open_brace_count - 1
   1.755 +            if open_brace_count == 0:
   1.756 +                filecont = filecont + '\n'
   1.757 +
   1.758 +        filecont = filecont + phrase
   1.759 +
   1.760 +    filecont2 = bibtex_replace_abbreviations(filecont)
   1.761 +
   1.762 +    # gather
   1.763 +    filecont = filecont2.splitlines()
   1.764 +    i=0
   1.765 +    j=0         # count the number of blank lines
   1.766 +    for line in filecont:
   1.767 +        # ignore blank lines
   1.768 +        if line == '' or line == ' ':
   1.769 +            j = j+1
   1.770 +            continue
   1.771 +        filecont[i] = line + '\n'
   1.772 +        i = i+1
   1.773 +
   1.774 +    # get rid of the extra stuff at the end of the array
   1.775 +    # (The extra stuff are duplicates that are in the array because
   1.776 +    # blank lines were removed.)
   1.777 +    length = len( filecont)
   1.778 +    filecont[length-j:length] = []
   1.779 +
   1.780 +    return filecont
   1.781 +
   1.782 +
   1.783 +def filehandler(filepath):
   1.784 +    try:
   1.785 +        fd = open(filepath, 'r')
   1.786 +        filecont_source = fd.readlines()
   1.787 +        fd.close()
   1.788 +    except:
   1.789 +        print 'Could not open file:', filepath
   1.790 +    washeddata = bibtexwasher(filecont_source)
   1.791 +    outdata = bibtexdecoder(washeddata)
   1.792 +    print '/**'
   1.793 +    print '\page references References'
   1.794 +    print
   1.795 +    for line in outdata:
   1.796 +        print line
   1.797 +    print '*/'
   1.798 +
   1.799 +
   1.800 +# main program
   1.801 +
   1.802 +def main():
   1.803 +    import sys
   1.804 +    if sys.argv[1:]:
   1.805 +        filepath = sys.argv[1]
   1.806 +    else:
   1.807 +        print "No input file"
   1.808 +        sys.exit()
   1.809 +    filehandler(filepath)
   1.810 +
   1.811 +if __name__ == "__main__": main()
   1.812 +
   1.813 +
   1.814 +# end python script
changeset 830	ef88c0a30f85
parent 792	68792fb2870f
child 905	c841ae1aca29