scripts/bib2dox.py
author Peter Kovacs <kpeter@inf.elte.hu>
Sat, 26 Sep 2009 10:15:49 +0200
changeset 790 94ef0a5c0005
child 792 68792fb2870f
permissions -rw-r--r--
Add bib->dox converter and initial references.bib (#184)
kpeter@790
     1
#!/usr/bin/env /usr/local/Python/bin/python2.1
kpeter@790
     2
"""
kpeter@790
     3
  BibTeX to Doxygen converter
kpeter@790
     4
  Usage: python bib2dox.py bibfile.bib > bibfile.dox
kpeter@790
     5
kpeter@790
     6
  This code is the modification of the BibTeX to XML converter
kpeter@790
     7
  by Vidar Bronken Gundersen et al. See the original copyright notices below. 
kpeter@790
     8
kpeter@790
     9
  **********************************************************************
kpeter@790
    10
kpeter@790
    11
  Decoder for bibliographic data, BibTeX
kpeter@790
    12
  Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
kpeter@790
    13
kpeter@790
    14
  v.8
kpeter@790
    15
  (c)2002-06-23 Vidar Bronken Gundersen
kpeter@790
    16
  http://bibtexml.sf.net/
kpeter@790
    17
  Reuse approved as long as this notification is kept.
kpeter@790
    18
  Licence: GPL.
kpeter@790
    19
kpeter@790
    20
  Contributions/thanks to:
kpeter@790
    21
  Egon Willighagen, http://sf.net/projects/jreferences/
kpeter@790
    22
  Richard Mahoney (for providing a test case)
kpeter@790
    23
kpeter@790
    24
  Edited by Sara Sprenkle to be more robust and handle more bibtex features.
kpeter@790
    25
  (c) 2003-01-15
kpeter@790
    26
kpeter@790
    27
  1.  Changed bibtex: tags to bibxml: tags.
kpeter@790
    28
  2.  Use xmlns:bibxml="http://bibtexml.sf.net/"
kpeter@790
    29
  3.  Allow spaces between @type and first {
kpeter@790
    30
  4.  "author" fields with multiple authors split by " and "
kpeter@790
    31
      are put in separate xml "bibxml:author" tags.
kpeter@790
    32
  5.  Option for Titles: words are capitalized
kpeter@790
    33
      only if first letter in title or capitalized inside braces
kpeter@790
    34
  6.  Removes braces from within field values
kpeter@790
    35
  7.  Ignores comments in bibtex file (including @comment{ or % )
kpeter@790
    36
  8.  Replaces some special latex tags, e.g., replaces ~ with '&#160;'
kpeter@790
    37
  9.  Handles bibtex @string abbreviations
kpeter@790
    38
        --> includes bibtex's default abbreviations for months
kpeter@790
    39
        --> does concatenation of abbr # " more " and " more " # abbr
kpeter@790
    40
  10. Handles @type( ... ) or @type{ ... }
kpeter@790
    41
  11. The keywords field is split on , or ; and put into separate xml
kpeter@790
    42
      "bibxml:keywords" tags
kpeter@790
    43
  12. Ignores @preamble
kpeter@790
    44
kpeter@790
    45
  Known Limitations
kpeter@790
    46
  1.  Does not transform Latex encoding like math mode and special
kpeter@790
    47
      latex symbols.
kpeter@790
    48
  2.  Does not parse author fields into first and last names.
kpeter@790
    49
      E.g., It does not do anything special to an author whose name is
kpeter@790
    50
      in the form LAST_NAME, FIRST_NAME
kpeter@790
    51
      In "author" tag, will show up as
kpeter@790
    52
      <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
kpeter@790
    53
  3.  Does not handle "crossref" fields other than to print
kpeter@790
    54
      <bibxml:crossref>...</bibxml:crossref>
kpeter@790
    55
  4.  Does not inform user of the input's format errors.  You just won't
kpeter@790
    56
      be able to transform the file later with XSL
kpeter@790
    57
kpeter@790
    58
  You will have to manually edit the XML output if you need to handle
kpeter@790
    59
  these (and unknown) limitations.
kpeter@790
    60
kpeter@790
    61
"""
kpeter@790
    62
kpeter@790
    63
import string, re
kpeter@790
    64
kpeter@790
    65
# set of valid name characters
kpeter@790
    66
valid_name_chars = '[\w\-:]'
kpeter@790
    67
kpeter@790
    68
#
kpeter@790
    69
# define global regular expression variables
kpeter@790
    70
#
kpeter@790
    71
author_rex = re.compile('\s+and\s+')
kpeter@790
    72
rembraces_rex = re.compile('[{}]')
kpeter@790
    73
capitalize_rex = re.compile('({\w*})')
kpeter@790
    74
kpeter@790
    75
# used by bibtexkeywords(data)
kpeter@790
    76
keywords_rex = re.compile('[,;]')
kpeter@790
    77
kpeter@790
    78
# used by concat_line(line)
kpeter@790
    79
concatsplit_rex = re.compile('\s*#\s*')
kpeter@790
    80
kpeter@790
    81
# split on {, }, or " in verify_out_of_braces
kpeter@790
    82
delimiter_rex = re.compile('([{}"])',re.I)
kpeter@790
    83
kpeter@790
    84
field_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
kpeter@790
    85
data_rex = re.compile('\s*(\w*)\s*=\s*([^,]*),?')
kpeter@790
    86
kpeter@790
    87
url_rex = re.compile('\\\url\{([^}]*)\}')
kpeter@790
    88
kpeter@790
    89
kpeter@790
    90
#
kpeter@790
    91
# return the string parameter with LaTeX \url{...} commands turned into HTML links
kpeter@790
    92
#
kpeter@790
    93
def transformurls(str):
kpeter@790
    94
    return url_rex.sub(r'<a href="\1">\1</a>', str)
kpeter@790
    95
kpeter@790
    96
#
kpeter@790
    97
# return the string parameter without braces
kpeter@790
    98
#
kpeter@790
    99
def removebraces(str):
kpeter@790
   100
    return rembraces_rex.sub('', str)
kpeter@790
   101
kpeter@790
   102
#
kpeter@790
   103
# latex-specific replacements
kpeter@790
   104
# (do this after braces were removed)
kpeter@790
   105
#
kpeter@790
   106
def latexreplacements(line):
kpeter@790
   107
    line = string.replace(line, '~', '&nbsp;')
kpeter@790
   108
    line = string.replace(line, '\\\'a', '&aacute;')
kpeter@790
   109
    line = string.replace(line, '\\"a', '&auml;')
kpeter@790
   110
    line = string.replace(line, '\\\'e', '&eacute;')
kpeter@790
   111
    line = string.replace(line, '\\"e', '&euml;')
kpeter@790
   112
    line = string.replace(line, '\\\'i', '&iacute;')
kpeter@790
   113
    line = string.replace(line, '\\"i', '&iuml;')
kpeter@790
   114
    line = string.replace(line, '\\\'o', '&oacute;')
kpeter@790
   115
    line = string.replace(line, '\\"o', '&ouml;')
kpeter@790
   116
    line = string.replace(line, '\\\'u', '&uacute;')
kpeter@790
   117
    line = string.replace(line, '\\"u', '&uuml;')
kpeter@790
   118
    line = string.replace(line, '\\H o', '&otilde;')
kpeter@790
   119
    line = string.replace(line, '\\H u', '&uuml;')   # &utilde; does not exist
kpeter@790
   120
    line = string.replace(line, '\\\'A', '&Aacute;')
kpeter@790
   121
    line = string.replace(line, '\\"A', '&Auml;')
kpeter@790
   122
    line = string.replace(line, '\\\'E', '&Eacute;')
kpeter@790
   123
    line = string.replace(line, '\\"E', '&Euml;')
kpeter@790
   124
    line = string.replace(line, '\\\'I', '&Iacute;')
kpeter@790
   125
    line = string.replace(line, '\\"I', '&Iuml;')
kpeter@790
   126
    line = string.replace(line, '\\\'O', '&Oacute;')
kpeter@790
   127
    line = string.replace(line, '\\"O', '&Ouml;')
kpeter@790
   128
    line = string.replace(line, '\\\'U', '&Uacute;')
kpeter@790
   129
    line = string.replace(line, '\\"U', '&Uuml;')
kpeter@790
   130
    line = string.replace(line, '\\H O', '&Otilde;')
kpeter@790
   131
    line = string.replace(line, '\\H U', '&Uuml;')   # &Utilde; does not exist
kpeter@790
   132
kpeter@790
   133
    return line
kpeter@790
   134
kpeter@790
   135
#
kpeter@790
   136
# copy characters from a string decoding html expressions (&xyz;)
kpeter@790
   137
#
kpeter@790
   138
def copychars(str, ifrom, count):
kpeter@790
   139
    result = ''
kpeter@790
   140
    i = ifrom
kpeter@790
   141
    c = 0
kpeter@790
   142
    html_spec = False
kpeter@790
   143
    while (i < len(str)) and (c < count):
kpeter@790
   144
        if str[i] == '&':
kpeter@790
   145
            html_spec = True;
kpeter@790
   146
            if i+1 < len(str):
kpeter@790
   147
                result += str[i+1]
kpeter@790
   148
            c += 1
kpeter@790
   149
            i += 2
kpeter@790
   150
        else:
kpeter@790
   151
            if not html_spec:
kpeter@790
   152
                if ((str[i] >= 'A') and (str[i] <= 'Z')) or \
kpeter@790
   153
                   ((str[i] >= 'a') and (str[i] <= 'z')):
kpeter@790
   154
                    result += str[i]
kpeter@790
   155
                    c += 1
kpeter@790
   156
            elif str[i] == ';':
kpeter@790
   157
                html_spec = False;
kpeter@790
   158
            i += 1
kpeter@790
   159
    
kpeter@790
   160
    return result
kpeter@790
   161
kpeter@790
   162
kpeter@790
   163
# 
kpeter@790
   164
# Handle a list of authors (separated by 'and').
kpeter@790
   165
# It gives back a dictionary with the following values:
kpeter@790
   166
#  - num: the number of authors,
kpeter@790
   167
#  - list: the list of the author names,
kpeter@790
   168
#  - text: the bibtex text (separated by commas and/or 'and')
kpeter@790
   169
#  - abbrev: abbreviation that can be used to indicate the
kpeter@790
   170
#    bibliography entries
kpeter@790
   171
#
kpeter@790
   172
def bibtexauthor(data):
kpeter@790
   173
    result = {}
kpeter@790
   174
    bibtex = ''
kpeter@790
   175
    result['list'] = author_rex.split(data)
kpeter@790
   176
    result['num'] = len(result['list'])
kpeter@790
   177
    for i, author in enumerate(result['list']):
kpeter@790
   178
        # general transformations
kpeter@790
   179
        author = latexreplacements(removebraces(author.strip()))
kpeter@790
   180
        # transform "Xyz, A. B." to "A. B. Xyz"
kpeter@790
   181
        pos = author.find(',')
kpeter@790
   182
        if pos != -1:
kpeter@790
   183
            author = author[pos+1:].strip() + ' ' + author[:pos].strip()
kpeter@790
   184
        result['list'][i] = author
kpeter@790
   185
        bibtex += author + '#'
kpeter@790
   186
    bibtex = bibtex[:-1]
kpeter@790
   187
    if result['num'] > 1:
kpeter@790
   188
        ix = bibtex.rfind('#')
kpeter@790
   189
        if result['num'] == 2:
kpeter@790
   190
            bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
kpeter@790
   191
        else:
kpeter@790
   192
            bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
kpeter@790
   193
    bibtex = bibtex.replace('#', ', ')
kpeter@790
   194
    result['text'] = bibtex
kpeter@790
   195
    
kpeter@790
   196
    result['abbrev'] = ''
kpeter@790
   197
    for author in result['list']:
kpeter@790
   198
        pos = author.rfind(' ') + 1
kpeter@790
   199
        count = 1
kpeter@790
   200
        if result['num'] == 1:
kpeter@790
   201
            count = 3
kpeter@790
   202
        result['abbrev'] += copychars(author, pos, count)
kpeter@790
   203
kpeter@790
   204
    return result
kpeter@790
   205
kpeter@790
   206
kpeter@790
   207
#
kpeter@790
   208
# data = title string
kpeter@790
   209
# @return the capitalized title (first letter is capitalized), rest are capitalized
kpeter@790
   210
# only if capitalized inside braces
kpeter@790
   211
#
kpeter@790
   212
def capitalizetitle(data):
kpeter@790
   213
    title_list = capitalize_rex.split(data)
kpeter@790
   214
    title = ''
kpeter@790
   215
    count = 0
kpeter@790
   216
    for phrase in title_list:
kpeter@790
   217
         check = string.lstrip(phrase)
kpeter@790
   218
kpeter@790
   219
         # keep phrase's capitalization the same
kpeter@790
   220
         if check.find('{') == 0:
kpeter@790
   221
              title += removebraces(phrase)
kpeter@790
   222
         else:
kpeter@790
   223
         # first word --> capitalize first letter (after spaces)
kpeter@790
   224
              if count == 0:
kpeter@790
   225
                  title += check.capitalize()
kpeter@790
   226
              else:
kpeter@790
   227
                  title += phrase.lower()
kpeter@790
   228
         count = count + 1
kpeter@790
   229
kpeter@790
   230
    return title
kpeter@790
   231
kpeter@790
   232
kpeter@790
   233
#
kpeter@790
   234
# @return the bibtex for the title
kpeter@790
   235
# @param data --> title string
kpeter@790
   236
# braces are removed from title
kpeter@790
   237
#
kpeter@790
   238
def bibtextitle(data, entrytype):
kpeter@790
   239
    if entrytype in ('book', 'inbook'):
kpeter@790
   240
        title = removebraces(data.strip())
kpeter@790
   241
    else:
kpeter@790
   242
        title = removebraces(capitalizetitle(data.strip()))
kpeter@790
   243
    bibtex = title
kpeter@790
   244
    return bibtex
kpeter@790
   245
kpeter@790
   246
kpeter@790
   247
#
kpeter@790
   248
# function to compare entry lists
kpeter@790
   249
#
kpeter@790
   250
def entry_cmp(x, y):
kpeter@790
   251
    return cmp(x[0], y[0])
kpeter@790
   252
kpeter@790
   253
kpeter@790
   254
#
kpeter@790
   255
# return the Doxygen (HTML table) lines for the transformed "filecont_source"
kpeter@790
   256
#
kpeter@790
   257
def bibtexdecoder(filecont_source):
kpeter@790
   258
    filecont = []
kpeter@790
   259
    file = []
kpeter@790
   260
    
kpeter@790
   261
    # want @<alphanumeric chars><spaces>{<spaces><any chars>,
kpeter@790
   262
    pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')
kpeter@790
   263
    endtype_rex = re.compile('}\s*$')
kpeter@790
   264
    endtag_rex = re.compile('^\s*}\s*$')
kpeter@790
   265
kpeter@790
   266
    bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
kpeter@790
   267
    bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')
kpeter@790
   268
kpeter@790
   269
    quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
kpeter@790
   270
    quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')
kpeter@790
   271
kpeter@790
   272
    for line in filecont_source:
kpeter@790
   273
        line = line[:-1]
kpeter@790
   274
kpeter@790
   275
        # encode character entities
kpeter@790
   276
        line = string.replace(line, '&', '&amp;')
kpeter@790
   277
        line = string.replace(line, '<', '&lt;')
kpeter@790
   278
        line = string.replace(line, '>', '&gt;')
kpeter@790
   279
kpeter@790
   280
        # start entry: publication type (store for later use)
kpeter@790
   281
        if pubtype_rex.match(line):
kpeter@790
   282
        # want @<alphanumeric chars><spaces>{<spaces><any chars>,
kpeter@790
   283
            entrycont = {}
kpeter@790
   284
            entry = []
kpeter@790
   285
            entrytype = pubtype_rex.sub('\g<1>',line)
kpeter@790
   286
            entrytype = string.lower(entrytype)
kpeter@790
   287
            # entryid   = pubtype_rex.sub('\g<2>', line)
kpeter@790
   288
kpeter@790
   289
        # end entry if just a }
kpeter@790
   290
        elif endtype_rex.match(line):
kpeter@790
   291
            # generate doxygen code for the entry
kpeter@790
   292
kpeter@790
   293
            # enty type related formattings
kpeter@790
   294
            if entrytype in ('book', 'inbook'):
kpeter@790
   295
                entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
kpeter@790
   296
                if not entrycont.has_key('author'):
kpeter@790
   297
                    entrycont['author'] = entrycont['editor']
kpeter@790
   298
                    entrycont['author']['text'] += ', editors'
kpeter@790
   299
            elif entrytype == 'article':
kpeter@790
   300
                entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
kpeter@790
   301
            elif entrytype in ('inproceedings', 'incollection', 'conference'):
kpeter@790
   302
                entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
kpeter@790
   303
            elif entrytype == 'techreport':
kpeter@790
   304
                if not entrycont.has_key('type'):
kpeter@790
   305
                    entrycont['type'] = 'Technical report'
kpeter@790
   306
            elif entrytype == 'mastersthesis':
kpeter@790
   307
                entrycont['type'] = 'Master\'s thesis'
kpeter@790
   308
            elif entrytype == 'phdthesis':
kpeter@790
   309
                entrycont['type'] = 'PhD thesis'
kpeter@790
   310
kpeter@790
   311
            for eline in entrycont:
kpeter@790
   312
                if eline != '':
kpeter@790
   313
                    eline = latexreplacements(eline)
kpeter@790
   314
kpeter@790
   315
            if entrycont.has_key('pages') and (entrycont['pages'] != ''):
kpeter@790
   316
                entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')
kpeter@790
   317
kpeter@790
   318
            if entrycont.has_key('author') and (entrycont['author'] != ''):
kpeter@790
   319
                entry.append(entrycont['author']['text'] + '.')
kpeter@790
   320
            if entrycont.has_key('title') and (entrycont['title'] != ''):
kpeter@790
   321
                entry.append(entrycont['title'] + '.')
kpeter@790
   322
            if entrycont.has_key('journal') and (entrycont['journal'] != ''):
kpeter@790
   323
                entry.append(entrycont['journal'] + ',')
kpeter@790
   324
            if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
kpeter@790
   325
                entry.append('In ' + entrycont['booktitle'] + ',')
kpeter@790
   326
            if entrycont.has_key('type') and (entrycont['type'] != ''):
kpeter@790
   327
                eline = entrycont['type']
kpeter@790
   328
                if entrycont.has_key('number') and (entrycont['number'] != ''):
kpeter@790
   329
                    eline += ' ' + entrycont['number']
kpeter@790
   330
                eline += ','
kpeter@790
   331
                entry.append(eline)
kpeter@790
   332
            if entrycont.has_key('institution') and (entrycont['institution'] != ''):
kpeter@790
   333
                entry.append(entrycont['institution'] + ',')
kpeter@790
   334
            if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
kpeter@790
   335
                entry.append(entrycont['publisher'] + ',')
kpeter@790
   336
            if entrycont.has_key('school') and (entrycont['school'] != ''):
kpeter@790
   337
                entry.append(entrycont['school'] + ',')
kpeter@790
   338
            if entrycont.has_key('address') and (entrycont['address'] != ''):
kpeter@790
   339
                entry.append(entrycont['address'] + ',')
kpeter@790
   340
            if entrycont.has_key('edition') and (entrycont['edition'] != ''):
kpeter@790
   341
                entry.append(entrycont['edition'] + ' edition,')
kpeter@790
   342
            if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
kpeter@790
   343
                entry.append(entrycont['howpublished'] + ',')
kpeter@790
   344
            if entrycont.has_key('volume') and (entrycont['volume'] != ''):
kpeter@790
   345
                eline = entrycont['volume'];
kpeter@790
   346
                if entrycont.has_key('number') and (entrycont['number'] != ''):
kpeter@790
   347
                    eline += '(' + entrycont['number'] + ')'
kpeter@790
   348
                if entrycont.has_key('pages') and (entrycont['pages'] != ''):
kpeter@790
   349
                    eline += ':' + entrycont['pages']
kpeter@790
   350
                eline += ','
kpeter@790
   351
                entry.append(eline)
kpeter@790
   352
            else:
kpeter@790
   353
                if entrycont.has_key('pages') and (entrycont['pages'] != ''):
kpeter@790
   354
                    entry.append('pages ' + entrycont['pages'] + ',')
kpeter@790
   355
            if entrycont.has_key('year') and (entrycont['year'] != ''):
kpeter@790
   356
                if entrycont.has_key('month') and (entrycont['month'] != ''):
kpeter@790
   357
                    entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
kpeter@790
   358
                else:
kpeter@790
   359
                    entry.append(entrycont['year'] + '.')
kpeter@790
   360
            if entrycont.has_key('note') and (entrycont['note'] != ''):
kpeter@790
   361
                entry.append(entrycont['note'] + '.')
kpeter@790
   362
kpeter@790
   363
            # generate keys for sorting and for the output
kpeter@790
   364
            sortkey = ''
kpeter@790
   365
            bibkey = ''
kpeter@790
   366
            if entrycont.has_key('author'):
kpeter@790
   367
                for author in entrycont['author']['list']:
kpeter@790
   368
                    sortkey += copychars(author, author.rfind(' ')+1, len(author))
kpeter@790
   369
                bibkey = entrycont['author']['abbrev']
kpeter@790
   370
            else:
kpeter@790
   371
                bibkey = 'x'
kpeter@790
   372
            if entrycont.has_key('year'):
kpeter@790
   373
                sortkey += entrycont['year']
kpeter@790
   374
                bibkey += entrycont['year'][-2:]
kpeter@790
   375
            if entrycont.has_key('title'):
kpeter@790
   376
                sortkey += entrycont['title']
kpeter@790
   377
            if entrycont.has_key('key'):
kpeter@790
   378
                sortkey = entrycont['key'] + sortkey
kpeter@790
   379
                bibkey = entrycont['key']
kpeter@790
   380
            entry.insert(0, sortkey)
kpeter@790
   381
            entry.insert(1, bibkey)
kpeter@790
   382
           
kpeter@790
   383
            # add the entry to the file contents
kpeter@790
   384
            filecont.append(entry)
kpeter@790
   385
kpeter@790
   386
        else:
kpeter@790
   387
            # field, publication info
kpeter@790
   388
            field = ''
kpeter@790
   389
            data = ''
kpeter@790
   390
            
kpeter@790
   391
            # field = {data} entries
kpeter@790
   392
            if bracedata_rex.match(line):
kpeter@790
   393
                field = bracefield_rex.sub('\g<1>', line)
kpeter@790
   394
                field = string.lower(field)
kpeter@790
   395
                data =  bracedata_rex.sub('\g<2>', line)
kpeter@790
   396
kpeter@790
   397
            # field = "data" entries
kpeter@790
   398
            elif quotedata_rex.match(line):
kpeter@790
   399
                field = quotefield_rex.sub('\g<1>', line)
kpeter@790
   400
                field = string.lower(field)
kpeter@790
   401
                data =  quotedata_rex.sub('\g<2>', line)
kpeter@790
   402
kpeter@790
   403
            # field = data entries
kpeter@790
   404
            elif data_rex.match(line):
kpeter@790
   405
                field = field_rex.sub('\g<1>', line)
kpeter@790
   406
                field = string.lower(field)
kpeter@790
   407
                data =  data_rex.sub('\g<2>', line)
kpeter@790
   408
            
kpeter@790
   409
            if field in ('author', 'editor'):
kpeter@790
   410
                entrycont[field] = bibtexauthor(data)
kpeter@790
   411
                line = ''
kpeter@790
   412
            elif field == 'title':
kpeter@790
   413
                line = bibtextitle(data, entrytype)
kpeter@790
   414
            elif field != '':
kpeter@790
   415
                line = removebraces(transformurls(data.strip()))
kpeter@790
   416
kpeter@790
   417
            if line != '':
kpeter@790
   418
                line = latexreplacements(line)
kpeter@790
   419
                entrycont[field] = line
kpeter@790
   420
kpeter@790
   421
kpeter@790
   422
    # sort entries
kpeter@790
   423
    filecont.sort(entry_cmp)
kpeter@790
   424
    
kpeter@790
   425
    # count the bibtex keys
kpeter@790
   426
    keytable = {}
kpeter@790
   427
    counttable = {}
kpeter@790
   428
    for entry in filecont:
kpeter@790
   429
        bibkey = entry[1]
kpeter@790
   430
        if not keytable.has_key(bibkey):
kpeter@790
   431
            keytable[bibkey] = 1
kpeter@790
   432
        else:
kpeter@790
   433
            keytable[bibkey] += 1
kpeter@790
   434
kpeter@790
   435
    for bibkey in keytable.keys():
kpeter@790
   436
        counttable[bibkey] = 0
kpeter@790
   437
    
kpeter@790
   438
    # generate output
kpeter@790
   439
    for entry in filecont:
kpeter@790
   440
        # generate output key form the bibtex key
kpeter@790
   441
        bibkey = entry[1]
kpeter@790
   442
        if keytable[bibkey] == 1:
kpeter@790
   443
            outkey = bibkey
kpeter@790
   444
        else:
kpeter@790
   445
            outkey = bibkey + chr(97 + counttable[bibkey])
kpeter@790
   446
        counttable[bibkey] += 1
kpeter@790
   447
        
kpeter@790
   448
        # append the entry code to the output
kpeter@790
   449
        file.append('<tr valign="top">\n' + \
kpeter@790
   450
                    '<td>[' + outkey + ']</td>')
kpeter@790
   451
        file.append('<td>')
kpeter@790
   452
        file.append('\\anchor ' + outkey)
kpeter@790
   453
        for line in entry[2:]:
kpeter@790
   454
            file.append(line)
kpeter@790
   455
        file.append('</td>\n</tr>')
kpeter@790
   456
        file.append('')
kpeter@790
   457
kpeter@790
   458
    return file
kpeter@790
   459
kpeter@790
   460
kpeter@790
   461
#
kpeter@790
   462
# return 1 iff abbr is in line but not inside braces or quotes
kpeter@790
   463
# assumes that abbr appears only once on the line (out of braces and quotes)
kpeter@790
   464
#
kpeter@790
   465
def verify_out_of_braces(line, abbr):
kpeter@790
   466
kpeter@790
   467
    phrase_split = delimiter_rex.split(line)
kpeter@790
   468
kpeter@790
   469
    abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)
kpeter@790
   470
kpeter@790
   471
    open_brace = 0
kpeter@790
   472
    open_quote = 0
kpeter@790
   473
kpeter@790
   474
    for phrase in phrase_split:
kpeter@790
   475
        if phrase == "{":
kpeter@790
   476
            open_brace = open_brace + 1
kpeter@790
   477
        elif phrase == "}":
kpeter@790
   478
            open_brace = open_brace - 1
kpeter@790
   479
        elif phrase == '"':
kpeter@790
   480
            if open_quote == 1:
kpeter@790
   481
                open_quote = 0
kpeter@790
   482
            else:
kpeter@790
   483
                open_quote = 1
kpeter@790
   484
        elif abbr_rex.search(phrase):
kpeter@790
   485
            if open_brace == 0 and open_quote == 0:
kpeter@790
   486
                return 1
kpeter@790
   487
kpeter@790
   488
    return 0
kpeter@790
   489
kpeter@790
   490
kpeter@790
   491
#
kpeter@790
   492
# a line in the form phrase1 # phrase2 # ... # phrasen
kpeter@790
   493
# is returned as phrase1 phrase2 ... phrasen
kpeter@790
   494
# with the correct punctuation
kpeter@790
   495
# Bug: Doesn't always work with multiple abbreviations plugged in
kpeter@790
   496
#
kpeter@790
   497
def concat_line(line):
kpeter@790
   498
    # only look at part after equals
kpeter@790
   499
    field = field_rex.sub('\g<1>',line)
kpeter@790
   500
    rest = field_rex.sub('\g<2>',line)
kpeter@790
   501
kpeter@790
   502
    concat_line = field + ' ='
kpeter@790
   503
kpeter@790
   504
    pound_split = concatsplit_rex.split(rest)
kpeter@790
   505
kpeter@790
   506
    phrase_count = 0
kpeter@790
   507
    length = len(pound_split)
kpeter@790
   508
kpeter@790
   509
    for phrase in pound_split:
kpeter@790
   510
        phrase = phrase.strip()
kpeter@790
   511
        if phrase_count != 0:
kpeter@790
   512
            if phrase.startswith('"') or phrase.startswith('{'):
kpeter@790
   513
                phrase = phrase[1:]
kpeter@790
   514
        elif phrase.startswith('"'):
kpeter@790
   515
            phrase = phrase.replace('"','{',1)
kpeter@790
   516
kpeter@790
   517
        if phrase_count != length-1:
kpeter@790
   518
            if phrase.endswith('"') or phrase.endswith('}'):
kpeter@790
   519
                phrase = phrase[:-1]
kpeter@790
   520
        else:
kpeter@790
   521
            if phrase.endswith('"'):
kpeter@790
   522
                phrase = phrase[:-1]
kpeter@790
   523
                phrase = phrase + "}"
kpeter@790
   524
            elif phrase.endswith('",'):
kpeter@790
   525
                phrase = phrase[:-2]
kpeter@790
   526
                phrase = phrase + "},"
kpeter@790
   527
kpeter@790
   528
        # if phrase did have \#, add the \# back
kpeter@790
   529
        if phrase.endswith('\\'):
kpeter@790
   530
            phrase = phrase + "#"
kpeter@790
   531
        concat_line = concat_line + ' ' + phrase
kpeter@790
   532
kpeter@790
   533
        phrase_count = phrase_count + 1
kpeter@790
   534
kpeter@790
   535
    return concat_line
kpeter@790
   536
kpeter@790
   537
kpeter@790
   538
#
kpeter@790
   539
# substitute abbreviations into filecont
kpeter@790
   540
# @param filecont_source - string of data from file
kpeter@790
   541
#
kpeter@790
   542
def bibtex_replace_abbreviations(filecont_source):
kpeter@790
   543
    filecont = filecont_source.splitlines()
kpeter@790
   544
kpeter@790
   545
    #  These are defined in bibtex, so we'll define them too
kpeter@790
   546
    abbr_list = ['jan','feb','mar','apr','may','jun',
kpeter@790
   547
                 'jul','aug','sep','oct','nov','dec']
kpeter@790
   548
    value_list = ['January','February','March','April',
kpeter@790
   549
                  'May','June','July','August','September',
kpeter@790
   550
                  'October','November','December']
kpeter@790
   551
kpeter@790
   552
    abbr_rex = []
kpeter@790
   553
    total_abbr_count = 0
kpeter@790
   554
kpeter@790
   555
    front = '\\b'
kpeter@790
   556
    back = '(,?)\\b'
kpeter@790
   557
kpeter@790
   558
    for x in abbr_list:
kpeter@790
   559
        abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
kpeter@790
   560
        total_abbr_count = total_abbr_count + 1
kpeter@790
   561
kpeter@790
   562
kpeter@790
   563
    abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
kpeter@790
   564
                             re.I)
kpeter@790
   565
kpeter@790
   566
    comment_rex = re.compile('@comment\s*{',re.I)
kpeter@790
   567
    preamble_rex = re.compile('@preamble\s*{',re.I)
kpeter@790
   568
kpeter@790
   569
    waiting_for_end_string = 0
kpeter@790
   570
    i = 0
kpeter@790
   571
    filecont2 = ''
kpeter@790
   572
kpeter@790
   573
    for line in filecont:
kpeter@790
   574
        if line == ' ' or line == '':
kpeter@790
   575
            continue
kpeter@790
   576
kpeter@790
   577
        if waiting_for_end_string:
kpeter@790
   578
            if re.search('}',line):
kpeter@790
   579
                waiting_for_end_string = 0
kpeter@790
   580
                continue
kpeter@790
   581
kpeter@790
   582
        if abbrdef_rex.search(line):
kpeter@790
   583
            abbr = abbrdef_rex.sub('\g<1>', line)
kpeter@790
   584
kpeter@790
   585
            if abbr_list.count(abbr) == 0:
kpeter@790
   586
                val = abbrdef_rex.sub('\g<2>', line)
kpeter@790
   587
                abbr_list.append(abbr)
kpeter@790
   588
                value_list.append(string.strip(val))
kpeter@790
   589
                abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
kpeter@790
   590
                total_abbr_count = total_abbr_count + 1
kpeter@790
   591
            waiting_for_end_string = 1
kpeter@790
   592
            continue
kpeter@790
   593
kpeter@790
   594
        if comment_rex.search(line):
kpeter@790
   595
            waiting_for_end_string = 1
kpeter@790
   596
            continue
kpeter@790
   597
kpeter@790
   598
        if preamble_rex.search(line):
kpeter@790
   599
            waiting_for_end_string = 1
kpeter@790
   600
            continue
kpeter@790
   601
kpeter@790
   602
kpeter@790
   603
        # replace subsequent abbreviations with the value
kpeter@790
   604
        abbr_count = 0
kpeter@790
   605
kpeter@790
   606
        for x in abbr_list:
kpeter@790
   607
kpeter@790
   608
            if abbr_rex[abbr_count].search(line):
kpeter@790
   609
                if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:
kpeter@790
   610
                    line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)
kpeter@790
   611
                # Check for # concatenations
kpeter@790
   612
                if concatsplit_rex.search(line):
kpeter@790
   613
                    line = concat_line(line)
kpeter@790
   614
            abbr_count = abbr_count + 1
kpeter@790
   615
kpeter@790
   616
kpeter@790
   617
        filecont2 = filecont2 + line + '\n'
kpeter@790
   618
        i = i+1
kpeter@790
   619
kpeter@790
   620
kpeter@790
   621
    # Do one final pass over file
kpeter@790
   622
kpeter@790
   623
    # make sure that didn't end up with {" or }" after the substitution
kpeter@790
   624
    filecont2 = filecont2.replace('{"','{{')
kpeter@790
   625
    filecont2 = filecont2.replace('"}','}}')
kpeter@790
   626
kpeter@790
   627
    afterquotevalue_rex = re.compile('"\s*,\s*')
kpeter@790
   628
    afterbrace_rex = re.compile('"\s*}')
kpeter@790
   629
    afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')
kpeter@790
   630
kpeter@790
   631
    # add new lines to data that changed because of abbreviation substitutions
kpeter@790
   632
    filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
kpeter@790
   633
    filecont2 = afterbrace_rex.sub('"\n}', filecont2)
kpeter@790
   634
    filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)
kpeter@790
   635
kpeter@790
   636
    return filecont2
kpeter@790
   637
kpeter@790
   638
#
kpeter@790
   639
# convert @type( ... ) to @type{ ... }
kpeter@790
   640
#
kpeter@790
   641
def no_outer_parens(filecont):
    """Rewrite parenthesis-delimited entries as brace-delimited ones.

    BibTeX allows both ``@type( ... )`` and ``@type{ ... }``.  This turns
    the outer parentheses of the former into braces and leaves every other
    parenthesis untouched.

    filecont -- the whole BibTeX file as one string.
    Returns the rewritten string.

    An entry head is only recognised when a piece of text *ends* with
    ``@type`` (optionally followed by whitespace), i.e. when the type name
    sits directly in front of the delimiter.  Searching for '@' anywhere in
    the text (the previous behaviour) mistook e-mail addresses inside field
    values for entry heads, which rewrote parentheses inside values and
    could cancel the conversion of the enclosing entry's closing paren.
    NOTE: a value that itself ends in ``@word`` right before a delimiter is
    still treated as an entry head.
    """
    # Split on every paren/brace; the capturing group keeps the delimiters
    # as separate tokens so the text can be reassembled unchanged.
    paren_split = re.split(r'([(){}])', filecont)

    # Matches text whose tail is "@type" plus optional trailing whitespace.
    at_rex = re.compile(r'@\w*\s*$')

    open_paren_count = 0
    open_type = 0    # 1 while a possible paren-delimited entry is open
    look_next = 0    # 1 when the previous token ended with "@type"

    # Collect the pieces and join once instead of growing a string.
    pieces = []

    for phrase in paren_split:
        if look_next == 1:
            if phrase == '(':
                # the delimiter right after "@type": opening of the entry
                phrase = '{'
                open_paren_count = open_paren_count + 1
            else:
                # entry is brace-delimited (or malformed): nothing to close
                open_type = 0
            look_next = 0

        if phrase == '(':
            open_paren_count = open_paren_count + 1

        elif phrase == ')':
            open_paren_count = open_paren_count - 1
            if open_type == 1 and open_paren_count == 0:
                # the paren that balances the converted opening one
                phrase = '}'
                open_type = 0

        elif at_rex.search(phrase):
            open_type = 1
            look_next = 1

        pieces.append(phrase)

    return ''.join(pieces)
kpeter@790
   681
kpeter@790
   682
kpeter@790
   683
#
kpeter@790
   684
# make all whitespace into just one space
kpeter@790
   685
# format the bibtex file into a usable form.
kpeter@790
   686
#
kpeter@790
   687
def bibtexwasher(filecont_source):
    """Clean up raw BibTeX lines and re-break them into a regular layout.

    filecont_source -- iterable of text lines (e.g. from readlines()).
    Returns a list of non-blank lines, each terminated by a newline.

    Steps, in order: trim and squeeze whitespace while dropping comment
    lines and empty lines, glue everything into one string, convert
    @type( ... ) to @type{ ... }, apply the line-breaking and LaTeX '&'
    rewrites below, put a newline before each brace that closes a
    top-level group, expand abbreviations, and finally drop blank lines.
    """
    whitespace_run = re.compile(r'\s+')
    comment_start = re.compile(r'\s*%')

    # remove trailing and excessive whitespace, ignore comments
    kept = []
    for raw in filecont_source:
        squeezed = whitespace_run.sub(' ', raw.strip())
        if squeezed == '' or comment_start.match(squeezed):
            continue
        kept.append(' ' + squeezed)

    # the file is now one long string
    text = no_outer_parens(''.join(kept))

    # Ordered (pattern, replacement) pairs; the order matters because each
    # rewrite works on the output of the previous one.
    rewrites = (
        # split lines according to preferred syntax scheme
        (r'(=\s*{[^=]*)},', '\\g<1>},\n'),
        # add new lines after commas that are after values
        (r'"\s*,', '",\n'),
        (r'=\s*([\w\d]+)\s*,', '= \\g<1>,\n'),
        (r'(@\w*)\s*({(\s*)[^,\s]*)\s*,', '\n\n\\g<1>\\g<2>,\n'),
        # add new lines after }
        (r'"\s*}', '"\n}\n'),
        (r'}\s*,', '},\n'),
        # every @type starts on a line of its own
        (r'@(\w*)', '\n@\\g<1>'),
        # character encoding, reserved latex characters
        (r'{\\&}', '&'),
        (r'\\&', '&'),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    # Track brace nesting so that the brace closing a top-level group is
    # preceded by a newline.
    depth = 0
    pieces = []
    for token in re.split(r'([{}])', text):
        if token == '{':
            depth = depth + 1
        elif token == '}':
            depth = depth - 1
            if depth == 0:
                pieces.append('\n')
        pieces.append(token)

    expanded = bibtex_replace_abbreviations(''.join(pieces))

    # gather the result, ignoring blank lines
    return [row + '\n'
            for row in expanded.splitlines()
            if row != '' and row != ' ']
kpeter@790
   769
kpeter@790
   770
kpeter@790
   771
def filehandler(filepath):
    """Convert the BibTeX file at *filepath* and print a Doxygen page.

    The file is read, passed through bibtexwasher() and bibtexdecoder(),
    and the resulting lines are printed to standard output wrapped in a
    Doxygen comment containing a references table.

    If the file cannot be opened or read, a message is written to standard
    error (standard output is normally redirected into the generated .dox
    file) and the script exits with status 1.  Previously the error was
    swallowed by a bare ``except`` and execution ran on into a NameError.
    """
    import sys

    try:
        fd = open(filepath, 'r')
        try:
            filecont_source = fd.readlines()
        finally:
            # close the file even when reading fails
            fd.close()
    except IOError:
        sys.stderr.write('Could not open file: %s\n' % filepath)
        sys.exit(1)

    washeddata = bibtexwasher(filecont_source)
    outdata = bibtexdecoder(washeddata)

    # Single-argument print(...) behaves the same as a statement (Python 2)
    # and as a function (Python 3).
    print('/**')
    print('\\page references References')
    print('')
    print('<table border="0" cellspacing="5px" width="100%">')
    print('')
    for line in outdata:
        print(line)
    print('</table>')
    print('')
    print('*/')
kpeter@790
   790
kpeter@790
   791
kpeter@790
   792
# main program
kpeter@790
   793
kpeter@790
   794
def main():
    """Command-line entry point: convert the file named by argv[1].

    Without an input file argument a message is written to standard error
    and the script exits with status 1, so that a calling build step sees
    the failure and the message does not end up in the redirected output.
    """
    import sys

    if not sys.argv[1:]:
        sys.stderr.write("No input file\n")
        sys.exit(1)

    filehandler(sys.argv[1])


if __name__ == "__main__":
    main()
kpeter@790
   804
kpeter@790
   805
kpeter@790
   806
# end python script