scripts/bib2dox.py
author Alpar Juttner <alpar@cs.elte.hu>
Tue, 26 Apr 2011 17:25:00 +0200
changeset 943 4f9e5801224e
parent 754 2de0fc630899
child 1052 eb2f9d453070
permissions -rwxr-xr-x
Merge bugfix #420
kpeter@836
     1
#! /usr/bin/env python
kpeter@743
     2
"""
kpeter@743
     3
  BibTeX to Doxygen converter
kpeter@743
     4
  Usage: python bib2dox.py bibfile.bib > bibfile.dox
kpeter@743
     5
kpeter@836
     6
  This file is a part of LEMON, a generic C++ optimization library.
kpeter@836
     7
kpeter@836
     8
  **********************************************************************
kpeter@836
     9
kpeter@743
    10
  This code is the modification of the BibTeX to XML converter
kpeter@836
    11
  by Vidar Bronken Gundersen et al.
kpeter@836
    12
  See the original copyright notices below. 
kpeter@743
    13
kpeter@743
    14
  **********************************************************************
kpeter@743
    15
kpeter@743
    16
  Decoder for bibliographic data, BibTeX
kpeter@743
    17
  Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
kpeter@743
    18
kpeter@743
    19
  v.8
kpeter@743
    20
  (c)2002-06-23 Vidar Bronken Gundersen
kpeter@743
    21
  http://bibtexml.sf.net/
kpeter@743
    22
  Reuse approved as long as this notification is kept.
kpeter@743
    23
  Licence: GPL.
kpeter@743
    24
kpeter@743
    25
  Contributions/thanks to:
kpeter@743
    26
  Egon Willighagen, http://sf.net/projects/jreferences/
kpeter@743
    27
  Richard Mahoney (for providing a test case)
kpeter@743
    28
kpeter@743
    29
  Editted by Sara Sprenkle to be more robust and handle more bibtex features.
kpeter@743
    30
  (c) 2003-01-15
kpeter@743
    31
kpeter@743
    32
  1.  Changed bibtex: tags to bibxml: tags.
kpeter@743
    33
  2.  Use xmlns:bibxml="http://bibtexml.sf.net/"
kpeter@743
    34
  3.  Allow spaces between @type and first {
kpeter@743
    35
  4.  "author" fields with multiple authors split by " and "
kpeter@743
    36
      are put in separate xml "bibxml:author" tags.
kpeter@743
    37
  5.  Option for Titles: words are capitalized
kpeter@743
    38
      only if first letter in title or capitalized inside braces
kpeter@743
    39
  6.  Removes braces from within field values
kpeter@743
    40
  7.  Ignores comments in bibtex file (including @comment{ or % )
kpeter@743
    41
  8.  Replaces some special latex tags, e.g., replaces ~ with '&#160;'
kpeter@743
    42
  9.  Handles bibtex @string abbreviations
kpeter@743
    43
        --> includes bibtex's default abbreviations for months
kpeter@743
    44
        --> does concatenation of abbr # " more " and " more " # abbr
kpeter@743
    45
  10. Handles @type( ... ) or @type{ ... }
kpeter@743
    46
  11. The keywords field is split on , or ; and put into separate xml
kpeter@743
    47
      "bibxml:keywords" tags
kpeter@743
    48
  12. Ignores @preamble
kpeter@743
    49
kpeter@743
    50
  Known Limitations
kpeter@743
    51
  1.  Does not transform Latex encoding like math mode and special
kpeter@743
    52
      latex symbols.
kpeter@743
    53
  2.  Does not parse author fields into first and last names.
kpeter@743
    54
      E.g., It does not do anything special to an author whose name is
kpeter@743
    55
      in the form LAST_NAME, FIRST_NAME
kpeter@743
    56
      In "author" tag, will show up as
kpeter@743
    57
      <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
kpeter@743
    58
  3.  Does not handle "crossref" fields other than to print
kpeter@743
    59
      <bibxml:crossref>...</bibxml:crossref>
kpeter@743
    60
  4.  Does not inform user of the input's format errors.  You just won't
kpeter@743
    61
      be able to transform the file later with XSL
kpeter@743
    62
kpeter@743
    63
  You will have to manually edit the XML output if you need to handle
kpeter@743
    64
  these (and unknown) limitations.
kpeter@743
    65
kpeter@743
    66
"""
kpeter@743
    67
kpeter@743
    68
import string, re
kpeter@743
    69
kpeter@743
    70
# set of valid name characters
kpeter@743
    71
valid_name_chars = r'[\w\-:]'

#
# define global regular expression variables
#
# NOTE: every pattern is written as a raw string so that regex escapes such
# as \s and \w are never interpreted as string-literal escapes.
author_rex = re.compile(r'\s+and\s+')
rembraces_rex = re.compile(r'[{}]')
capitalize_rex = re.compile(r'({[^}]*})')

# used by bibtexkeywords(data)
keywords_rex = re.compile(r'[,;]')

# used by concat_line(line)
concatsplit_rex = re.compile(r'\s*#\s*')

# split on {, }, or " in verify_out_of_braces
delimiter_rex = re.compile(r'([{}"])', re.I)

# "field = value" lines: group 1 is the field name, group 2 the value
field_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
data_rex = re.compile(r'\s*(\w*)\s*=\s*([^,]*),?')

# matches a backslash followed by url{...}; group 1 is the text in the braces.
# (A non-raw literal here is a syntax error on Python 3, where a backslash
# followed by "u" starts a unicode escape.)
url_rex = re.compile(r'\\url\{([^}]*)\}')

#
# styles for html formatting
#
divstyle = 'margin-top: -4ex; margin-left: 8em;'
kpeter@743
    98
kpeter@743
    99
#
kpeter@743
   100
# return the string parameter with \url{...} commands turned into HTML links
kpeter@743
   101
#
kpeter@743
   102
def transformurls(str):
    """Return ``str`` with each match of ``url_rex`` rewritten as an HTML link.

    The text captured by the pattern is used both as the ``href`` target and
    as the visible link text.
    """
    html_link = r'<a href="\1">\1</a>'
    return url_rex.sub(html_link, str)
kpeter@743
   104
kpeter@743
   105
#
kpeter@743
   106
# return the string parameter without braces
kpeter@743
   107
#
kpeter@743
   108
def removebraces(str):
    """Return ``str`` with every character matched by ``rembraces_rex`` removed."""
    without_braces = rembraces_rex.sub('', str)
    return without_braces
kpeter@743
   110
kpeter@743
   111
#
kpeter@743
   112
# latex-specific replacements
kpeter@743
   113
# (do this after braces were removed)
kpeter@743
   114
#
kpeter@743
   115
def latexreplacements(line):
    """Replace a fixed set of LaTeX sequences in ``line`` by HTML entities.

    Handles the tilde (non-breaking space) and the acute, umlaut and
    Hungarian-umlaut accents on a, e, i, o, u in both cases.  The accent
    commands are matched in their brace-less form, so braces have to be
    removed from ``line`` before calling this function.

    Returns the converted string; ``line`` itself is not modified.
    """
    # (LaTeX sequence, HTML entity) pairs, applied in this order.
    # str.replace is used instead of string.replace, which no longer exists
    # on Python 3.
    replacements = (
        ('~', '&nbsp;'),
        ("\\'a", '&aacute;'),
        ('\\"a', '&auml;'),
        ("\\'e", '&eacute;'),
        ('\\"e', '&euml;'),
        ("\\'i", '&iacute;'),
        ('\\"i', '&iuml;'),
        ("\\'o", '&oacute;'),
        ('\\"o', '&ouml;'),
        ("\\'u", '&uacute;'),
        ('\\"u', '&uuml;'),
        ('\\H o', '&otilde;'),
        ('\\H u', '&uuml;'),   # &utilde; does not exist
        ("\\'A", '&Aacute;'),
        ('\\"A', '&Auml;'),
        ("\\'E", '&Eacute;'),
        ('\\"E', '&Euml;'),
        ("\\'I", '&Iacute;'),
        ('\\"I', '&Iuml;'),
        ("\\'O", '&Oacute;'),
        ('\\"O', '&Ouml;'),
        ("\\'U", '&Uacute;'),
        ('\\"U', '&Uuml;'),
        ('\\H O', '&Otilde;'),
        ('\\H U', '&Uuml;'),   # &Utilde; does not exist
    )
    for latex, html in replacements:
        line = line.replace(latex, html)
    return line
kpeter@743
   143
kpeter@743
   144
#
kpeter@743
   145
# copy characters from a string, decoding html expressions (&xyz;)
kpeter@743
   146
#
kpeter@743
   147
def copychars(str, ifrom, count):
    """Copy up to ``count`` letters from ``str``, starting at index ``ifrom``.

    Only ASCII letters are copied; every other character is skipped without
    being counted.  An HTML entity (``&xyz;``) counts as one character and is
    represented in the result by the character that follows the ``&``; the
    remainder of the entity up to and including ``;`` is skipped.

    Returns the collected characters as a string (possibly shorter than
    ``count`` if the end of ``str`` is reached first).
    """
    collected = []
    taken = 0
    in_entity = False
    end = len(str)
    i = ifrom
    while i < end and taken < count:
        ch = str[i]
        if ch == '&':
            # start of an entity: keep only the character after the '&'
            in_entity = True
            if i + 1 < end:
                collected.append(str[i + 1])
            taken += 1
            i += 2
            continue
        if not in_entity:
            if 'A' <= ch <= 'Z' or 'a' <= ch <= 'z':
                collected.append(ch)
                taken += 1
        elif ch == ';':
            # end of the entity
            in_entity = False
        i += 1

    return ''.join(collected)
kpeter@743
   170
kpeter@743
   171
kpeter@743
   172
# 
kpeter@743
   173
# Handle a list of authors (separated by 'and').
kpeter@743
   174
# It gives back a dictionary of the following values:
kpeter@743
   175
#  - num: the number of authors,
kpeter@743
   176
#  - list: the list of the author names,
kpeter@743
   177
#  - text: the bibtex text (separated by commas and/or 'and')
kpeter@743
   178
#  - abbrev: abbreviation that can be used for indicate the
kpeter@743
   179
#    bibliography entries
kpeter@743
   180
#
kpeter@743
   181
def bibtexauthor(data):
    """Parse a BibTeX author/editor field.

    ``data`` is split with ``author_rex`` (names separated by 'and').  Each
    name is stripped, has its braces removed and its LaTeX accents replaced,
    and "Xyz, A. B." is reordered to "A. B. Xyz".

    Returns a dictionary with the keys
      - 'num':    the number of authors,
      - 'list':   the list of the transformed author names,
      - 'text':   the names joined as "A", "A and B" or "A, B, and C",
      - 'abbrev': the first letter of each author's last word, or up to
                  three letters of it if there is a single author.
    """
    authors = []
    for raw in author_rex.split(data):
        # general transformations
        author = latexreplacements(removebraces(raw.strip()))
        # transform "Xyz, A. B." to "A. B. Xyz"
        pos = author.find(',')
        if pos != -1:
            author = author[pos+1:].strip() + ' ' + author[:pos].strip()
        authors.append(author)

    num = len(authors)

    # Join the names directly instead of going through a '#' placeholder,
    # which corrupted names that themselves contain a '#'.
    if num <= 2:
        text = ' and '.join(authors)
    else:
        text = ', '.join(authors[:-1]) + ', and ' + authors[-1]

    # a single author contributes three letters, otherwise one letter each
    if num == 1:
        count = 3
    else:
        count = 1
    abbrev = ''.join([copychars(author, author.rfind(' ') + 1, count)
                      for author in authors])

    return {'list': authors, 'num': num, 'text': text, 'abbrev': abbrev}
kpeter@743
   214
kpeter@743
   215
kpeter@743
   216
#
kpeter@743
   217
# data = title string
kpeter@743
   218
# @return the capitalized title (first letter is capitalized), rest are capitalized
kpeter@743
   219
# only if capitalized inside braces
kpeter@743
   220
#
kpeter@743
   221
def capitalizetitle(data):
    """Normalize the capitalization of the title string ``data``.

    The title is split with ``capitalize_rex`` into brace groups and the text
    between them:
      - a brace group keeps its capitalization (only the braces are removed),
      - the very first piece, if it is not a brace group, has its leading
        whitespace removed and is capitalized with ``str.capitalize``,
      - every other piece is lower-cased.

    Returns the resulting title.
    """
    pieces = []
    for index, phrase in enumerate(capitalize_rex.split(data)):
        # str.lstrip replaces string.lstrip, which is gone on Python 3
        check = phrase.lstrip()
        if check.startswith('{'):
            # keep phrase's capitalization the same
            pieces.append(removebraces(phrase))
        elif index == 0:
            # first word --> capitalize first letter (after spaces)
            pieces.append(check.capitalize())
        else:
            pieces.append(phrase.lower())
    return ''.join(pieces)
kpeter@743
   240
kpeter@743
   241
kpeter@743
   242
#
kpeter@743
   243
# @return the bibtex for the title
kpeter@743
   244
# @param data --> title string
kpeter@743
   245
# braces are removed from title
kpeter@743
   246
#
kpeter@743
   247
def bibtextitle(data, entrytype):
    """Return the title text for ``data`` with braces removed.

    Titles of 'book' and 'inbook' entries keep their capitalization; all
    other titles are passed through ``capitalizetitle`` first.
    """
    stripped = data.strip()
    if entrytype not in ('book', 'inbook'):
        stripped = capitalizetitle(stripped)
    return removebraces(stripped)
kpeter@743
   254
kpeter@743
   255
kpeter@743
   256
#
kpeter@743
   257
# function to compare entry lists
kpeter@743
   258
#
kpeter@743
   259
def entry_cmp(x, y):
    """Compare two entry lists by their first element.

    Returns -1, 0 or 1 if ``x[0]`` is less than, equal to or greater than
    ``y[0]``.  Implemented without the ``cmp`` builtin, which does not exist
    on Python 3.
    """
    left, right = x[0], y[0]
    return (left > right) - (left < right)
kpeter@743
   261
kpeter@743
   262
kpeter@743
   263
#
kpeter@743
   264
# return the list of Doxygen lines generated from "filecont_source"
kpeter@743
   265
#
kpeter@743
   266
# Patterns used by bibtexdecoder() and its helpers.
# start of an entry: @<alphanumeric chars><spaces>{<spaces><any chars>,
_pubtype_rex = re.compile(r'@(\w*)\s*{\s*(.*),')
# end of an entry: a line that starts with '}' followed only by whitespace
_endtype_rex = re.compile(r'}\s*$')
# field = {data} entries
_bracedata_rex = re.compile(r'\s*(\w*)\s*=\s*{(.*)},?')
# field = "data" entries
_quotedata_rex = re.compile(r'\s*(\w*)\s*=\s*"(.*)",?')


def _has_value(entrycont, field):
    """Return True if ``field`` is stored in ``entrycont`` and is not ''."""
    return field in entrycont and entrycont[field] != ''


def _emphasize(entrycont, field):
    """Wrap the value of ``field`` in <em> tags, if the field is present."""
    if field in entrycont:
        entrycont[field] = '<em>' + entrycont[field] + '</em>'


def _split_field(line):
    """Split a field line into (lower-cased field name, raw data).

    The braced, quoted and bare value forms are tried in this order.
    Returns ('', '') if the line matches none of them.
    """
    for value_rex in (_bracedata_rex, _quotedata_rex, data_rex):
        if value_rex.match(line):
            field = field_rex.sub(r'\g<1>', line).lower()
            data = value_rex.sub(r'\g<2>', line)
            return field, data
    return '', ''


def _format_entry(entrytype, entryid, entrycont):
    """Build the list describing one finished entry.

    The result is [sortkey, bibkey, entryid, text line, text line, ...].
    ``entrycont`` is modified in place by the type related formatting.
    """
    # entry type related formattings; fields that are missing are simply
    # left out instead of raising KeyError
    if entrytype in ('book', 'inbook'):
        _emphasize(entrycont, 'title')
        if 'author' not in entrycont and 'editor' in entrycont:
            entrycont['author'] = entrycont['editor']
            entrycont['author']['text'] += ', editors'
    elif entrytype == 'article':
        _emphasize(entrycont, 'journal')
    elif entrytype in ('inproceedings', 'incollection', 'conference'):
        _emphasize(entrycont, 'booktitle')
    elif entrytype == 'techreport':
        if 'type' not in entrycont:
            entrycont['type'] = 'Technical report'
    elif entrytype == 'mastersthesis':
        entrycont['type'] = 'Master\'s thesis'
    elif entrytype == 'phdthesis':
        entrycont['type'] = 'PhD thesis'

    if _has_value(entrycont, 'pages'):
        entrycont['pages'] = entrycont['pages'].replace('--', '-')

    # text lines of the entry, in output order
    parts = []
    if _has_value(entrycont, 'author'):
        parts.append(entrycont['author']['text'] + '.')
    if _has_value(entrycont, 'title'):
        parts.append(entrycont['title'] + '.')
    if _has_value(entrycont, 'journal'):
        parts.append(entrycont['journal'] + ',')
    if _has_value(entrycont, 'booktitle'):
        parts.append('In ' + entrycont['booktitle'] + ',')
    if _has_value(entrycont, 'type'):
        eline = entrycont['type']
        if _has_value(entrycont, 'number'):
            eline += ' ' + entrycont['number']
        parts.append(eline + ',')
    for field in ('institution', 'publisher', 'school', 'address'):
        if _has_value(entrycont, field):
            parts.append(entrycont[field] + ',')
    if _has_value(entrycont, 'edition'):
        parts.append(entrycont['edition'] + ' edition,')
    if _has_value(entrycont, 'howpublished'):
        parts.append(entrycont['howpublished'] + ',')
    if _has_value(entrycont, 'volume'):
        eline = entrycont['volume']
        if _has_value(entrycont, 'number'):
            eline += '(' + entrycont['number'] + ')'
        if _has_value(entrycont, 'pages'):
            eline += ':' + entrycont['pages']
        parts.append(eline + ',')
    elif _has_value(entrycont, 'pages'):
        parts.append('pages ' + entrycont['pages'] + ',')
    if _has_value(entrycont, 'year'):
        if _has_value(entrycont, 'month'):
            parts.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
        else:
            parts.append(entrycont['year'] + '.')
    if _has_value(entrycont, 'note'):
        parts.append(entrycont['note'] + '.')
    if _has_value(entrycont, 'url'):
        parts.append(entrycont['url'] + '.')

    # generate keys for sorting and for the output
    sortkey = ''
    if 'author' in entrycont:
        for author in entrycont['author']['list']:
            sortkey += copychars(author, author.rfind(' ') + 1, len(author))
        bibkey = entrycont['author']['abbrev']
    else:
        bibkey = 'x'
    if 'year' in entrycont:
        sortkey += entrycont['year']
        bibkey += entrycont['year'][-2:]
    if 'title' in entrycont:
        sortkey += entrycont['title']
    if 'key' in entrycont:
        # an explicit "key" field overrides the generated label
        sortkey = entrycont['key'] + sortkey
        bibkey = entrycont['key']

    return [sortkey, bibkey, entryid] + parts


def bibtexdecoder(filecont_source):
    """Convert BibTeX lines into the lines of a Doxygen page.

    ``filecont_source`` is an iterable of lines: an entry starts with a
    line matching "@type{id,", continues with "field = value" lines and ends
    with a line starting with '}'.  The last character of every line is
    dropped (presumably a trailing newline added by the caller -- TODO
    confirm against the code that produces the lines).

    Returns a list of strings: for every entry, sorted by its sort key, a
    "\\section <id> [<label>]" line, a <div> holding the formatted text
    lines, and an empty line.  Entries sharing the same label get the
    suffixes a, b, c, ...

    Lines that appear outside of an entry are ignored.
    """
    filecont = []
    entrycont = None   # fields of the entry being read; None outside entries
    entrytype = ''
    entryid = ''

    for line in filecont_source:
        line = line[:-1]

        # encode character entities ('&' first, so that the entities
        # produced for '<' and '>' are not escaped again)
        line = line.replace('&', '&amp;')
        line = line.replace('<', '&lt;')
        line = line.replace('>', '&gt;')

        # start entry: publication type (store for later use)
        if _pubtype_rex.match(line):
            entrycont = {}
            entrytype = _pubtype_rex.sub(r'\g<1>', line).lower()
            entryid = _pubtype_rex.sub(r'\g<2>', line)
            continue

        if entrycont is None:
            # nothing to attach this line to
            continue

        # end entry if just a }
        if _endtype_rex.match(line):
            filecont.append(_format_entry(entrytype, entryid, entrycont))
            entrycont = None
            continue

        # field, publication info
        field, data = _split_field(line)
        if field == '':
            continue

        if field == 'url':
            data = '\\url{' + data.strip() + '}'

        if field in ('author', 'editor'):
            entrycont[field] = bibtexauthor(data)
            continue

        if field == 'title':
            line = bibtextitle(data, entrytype)
        else:
            line = removebraces(transformurls(data.strip()))

        if line != '':
            entrycont[field] = latexreplacements(line)

    # sort entries by their sort key (list.sort is stable, so entries with
    # equal keys keep their input order)
    filecont.sort(key=lambda entry: entry[0])

    # count the bibtex keys
    keytable = {}
    for entry in filecont:
        keytable[entry[1]] = keytable.get(entry[1], 0) + 1
    counttable = dict.fromkeys(keytable, 0)

    # generate output
    output = []
    for entry in filecont:
        # generate output key from the bibtex key
        bibkey = entry[1]
        entryid = entry[2]
        if keytable[bibkey] == 1:
            outkey = bibkey
        else:
            # distinguish identical keys by the suffix a, b, c, ...
            outkey = bibkey + chr(97 + counttable[bibkey])
        counttable[bibkey] += 1

        # append the entry code to the output
        output.append('\\section ' + entryid + ' [' + outkey + ']')
        output.append('<div style="' + divstyle + '">')
        output.extend(entry[3:])
        output.append('</div>')
        output.append('')

    return output
kpeter@743
   473
kpeter@743
   474
kpeter@743
   475
#
kpeter@743
   476
# return 1 iff abbr is in line but not inside braces or quotes
kpeter@743
   477
# assumes that abbr appears only once on the line (out of braces and quotes)
kpeter@743
   478
#
kpeter@743
   479
def verify_out_of_braces(line, abbr):
    """Return 1 if ``abbr`` occurs in ``line`` outside braces and quotes, else 0.

    ``line`` is split at every '{', '}' and '"'.  While walking over the
    pieces the current brace depth and quote state are tracked, and ``abbr``
    is searched as a whole word, ignoring case, in the text pieces.
    """
    word_rex = re.compile( '\\b' + abbr + '\\b', re.I)

    depth = 0
    quoted = 0

    for token in delimiter_rex.split(line):
        if token == "{":
            depth += 1
        elif token == "}":
            depth -= 1
        elif token == '"':
            # a quote toggles the "inside quotes" state
            quoted = 1 - quoted
        elif depth == 0 and quoted == 0 and word_rex.search(token):
            return 1

    return 0
kpeter@743
   503
kpeter@743
   504
kpeter@743
   505
#
kpeter@743
   506
# a line in the form phrase1 # phrase2 # ... # phrasen
kpeter@743
   507
# is returned as phrase1 phrase2 ... phrasen
kpeter@743
   508
# with the correct punctuation
kpeter@743
   509
# Bug: Doesn't always work with multiple abbreviations plugged in
kpeter@743
   510
#
kpeter@743
   511
def concat_line(line):
    """Join the '#'-concatenated phrases of a "field = value" line.

    A line of the form  field = phrase1 # phrase2 # ... # phrasen  is
    returned as  "field = phrase1 phrase2 ... phrasen":
      - the quote or brace delimiters between neighbouring phrases are
        dropped,
      - a leading '"' of the first phrase and a trailing '"' (optionally
        followed by ',') of the last phrase are turned into braces,
      - a phrase ending in a backslash gets its '#' back, because the split
        happened at an escaped '#'.

    Known bug: doesn't always work with multiple abbreviations plugged in.
    """
    # only look at part after equals
    field = field_rex.sub(r'\g<1>', line)
    rest = field_rex.sub(r'\g<2>', line)

    phrases = concatsplit_rex.split(rest)
    last = len(phrases) - 1

    joined = [field + ' =']
    for index, phrase in enumerate(phrases):
        phrase = phrase.strip()

        if index != 0:
            # inner phrase: drop its opening delimiter
            if phrase.startswith(('"', '{')):
                phrase = phrase[1:]
        elif phrase.startswith('"'):
            phrase = phrase.replace('"', '{', 1)

        if index != last:
            # inner phrase: drop its closing delimiter
            if phrase.endswith(('"', '}')):
                phrase = phrase[:-1]
        elif phrase.endswith('"'):
            phrase = phrase[:-1] + '}'
        elif phrase.endswith('",'):
            phrase = phrase[:-2] + '},'

        # if phrase did have \#, add the \# back
        if phrase.endswith('\\'):
            phrase = phrase + '#'

        joined.append(phrase)

    return ' '.join(joined)
kpeter@743
   550
kpeter@743
   551
kpeter@743
   552
#
kpeter@743
   553
# substitute abbreviations into filecont
kpeter@743
   554
# @param filecont_source - string of data from file
kpeter@743
   555
#
kpeter@743
   556
def bibtex_replace_abbreviations(filecont_source):
    """Substitute @string abbreviations into the BibTeX text.

    ``filecont_source`` is the file content as one string.  It is processed
    line by line:
      - empty lines and lines consisting of a single space are dropped,
      - @string, @comment and @preamble lines are dropped; after one of
        them the next line containing '}' is dropped as well,
      - in every other line the known abbreviations (the month names plus
        the ones defined by @string) that occur outside braces and quotes
        are replaced by their values, and '#' concatenations are joined
        with ``concat_line``.

    Returns the resulting text as one string, with line breaks re-inserted
    after the values changed by the substitution.
    """
    filecont = filecont_source.splitlines()

    #  These are defined in bibtex, so we'll define them too
    abbr_list = ['jan','feb','mar','apr','may','jun',
                 'jul','aug','sep','oct','nov','dec']
    value_list = ['January','February','March','April',
                  'May','June','July','August','September',
                  'October','November','December']

    front = r'\b'
    back = r'(,?)\b'

    # abbr_list, value_list and abbr_rex are kept index-aligned
    abbr_rex = [re.compile(front + abbr + back, re.I) for abbr in abbr_list]

    abbrdef_rex = re.compile(r'\s*@string\s*{\s*(' + valid_name_chars + r'*)\s*=(.*)',
                             re.I)

    comment_rex = re.compile(r'@comment\s*{', re.I)
    preamble_rex = re.compile(r'@preamble\s*{', re.I)

    waiting_for_end_string = 0
    kept_lines = []

    for line in filecont:
        if line == ' ' or line == '':
            continue

        if waiting_for_end_string:
            if '}' in line:
                waiting_for_end_string = 0
                continue

        if abbrdef_rex.search(line):
            abbr = abbrdef_rex.sub(r'\g<1>', line)

            if abbr not in abbr_list:
                val = abbrdef_rex.sub(r'\g<2>', line)
                abbr_list.append(abbr)
                value_list.append(val.strip())
                abbr_rex.append(re.compile(front + abbr + back, re.I))
            waiting_for_end_string = 1
            continue

        if comment_rex.search(line):
            waiting_for_end_string = 1
            continue

        if preamble_rex.search(line):
            waiting_for_end_string = 1
            continue

        # replace subsequent abbreviations with the value
        for abbr, rex, value in zip(abbr_list, abbr_rex, value_list):
            if rex.search(line):
                if verify_out_of_braces(line, abbr) == 1:
                    # A replacement function inserts the value literally;
                    # used as a template string, backslashes in the value
                    # (LaTeX commands) would be treated as regex escapes.
                    line = rex.sub(
                        lambda match, value=value: value + match.group(1),
                        line)
                # Check for # concatenations
                if concatsplit_rex.search(line):
                    line = concat_line(line)

        kept_lines.append(line + '\n')

    filecont2 = ''.join(kept_lines)

    # Do one final pass over file

    # make sure that didn't end up with {" or }" after the substitution
    filecont2 = filecont2.replace('{"', '{{')
    filecont2 = filecont2.replace('"}', '}}')

    afterquotevalue_rex = re.compile(r'"\s*,\s*')
    afterbrace_rex = re.compile(r'"\s*}')
    afterbracevalue_rex = re.compile(r'(=\s*{[^=]*)},\s*')

    # add new lines to data that changed because of abbreviation substitutions
    filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
    filecont2 = afterbrace_rex.sub('"\n}', filecont2)
    filecont2 = afterbracevalue_rex.sub('\\g<1>},\n', filecont2)

    return filecont2
kpeter@743
   651
kpeter@743
   652
#
kpeter@743
   653
# convert @type( ... ) to @type{ ... }
kpeter@743
   654
#
kpeter@743
   655
def no_outer_parens(filecont):
    """Convert ``@type( ... )`` entry delimiters to ``@type{ ... }``.

    The text is split on parentheses and braces.  A ``(`` that directly
    follows a chunk containing ``@`` is taken to open an entry and is
    rewritten as ``{``; the ``)`` that brings the parenthesis depth back to
    zero while that entry is still open is rewritten as ``}``.  Everything
    else, including parentheses nested inside the entry, is copied through
    unchanged.

    filecont -- the BibTeX data as one string

    Returns the rewritten string.
    """
    # The capturing group keeps the delimiters in the result, so text chunks
    # and single-character delimiters alternate in `pieces`.
    pieces = re.split(r'([(){}])', filecont)
    at_rex = re.compile(r'@\w*')

    open_paren_count = 0
    open_type = 0    # 1 while an '@type(' entry is (about to be) open
    look_next = 0    # 1 when the previous chunk contained an '@'

    # Collect the output chunks and join once at the end instead of growing
    # a string piece by piece.
    rebuilt = []

    for phrase in pieces:
        if look_next == 1:
            if phrase == '(':
                # '@type(' -> '@type{'
                phrase = '{'
                open_paren_count = open_paren_count + 1
            else:
                # The '@...' chunk was not followed by '(' (for example an
                # entry that already uses braces): nothing to close later.
                open_type = 0
            look_next = 0

        if phrase == '(':
            open_paren_count = open_paren_count + 1

        elif phrase == ')':
            open_paren_count = open_paren_count - 1
            if open_type == 1 and open_paren_count == 0:
                # This ')' matches the '(' that opened the entry.
                phrase = '}'
                open_type = 0

        elif at_rex.search(phrase):
            # NOTE: any chunk containing '@' is treated as an entry header,
            # not only one that ends in '@type'.
            open_type = 1
            look_next = 1

        rebuilt.append(phrase)

    return ''.join(rebuilt)
kpeter@743
   695
kpeter@743
   696
kpeter@743
   697
#
kpeter@743
   698
# make all whitespace into just one space
kpeter@743
   699
# format the bibtex file into a usable form.
kpeter@743
   700
#
kpeter@743
   701
def bibtexwasher(filecont_source):
    """Normalise raw BibTeX lines into a one-item-per-line layout.

    Steps, in order:
      1. strip each line, collapse runs of whitespace to one space and drop
         empty lines and lines starting with '%';
      2. join everything into one string and convert '@type( ... )' to
         '@type{ ... }' with no_outer_parens();
      3. insert line breaks after field values, after entry headers and
         before every '@';
      4. replace the LaTeX escapes '{\\&}' and '\\&' by '&';
      5. put the brace that closes a top-level group on its own line;
      6. expand abbreviations with bibtex_replace_abbreviations();
      7. split into lines again, dropping lines that are '' or ' '.

    filecont_source -- iterable of the lines of the .bib file

    Returns a list of strings, each terminated by a newline character.
    """
    space_rex = re.compile(r'\s+')
    comment_rex = re.compile(r'\s*%')

    # remove trailing and excessive whitespace, ignore comments
    washed = []
    for line in filecont_source:
        line = space_rex.sub(' ', line.strip())
        if line != '' and not comment_rex.match(line):
            washed.append(' ' + line)

    # the file is in one long string
    filecont = ''.join(washed)

    filecont = no_outer_parens(filecont)

    #
    # split lines according to preferred syntax scheme
    #
    filecont = re.sub(r'(=\s*{[^=]*)},', r'\g<1>},\n', filecont)

    # add new lines after commas that are after values
    filecont = re.sub(r'"\s*,', '",\n', filecont)
    filecont = re.sub(r'=\s*([\w\d]+)\s*,', r'= \g<1>,\n', filecont)
    filecont = re.sub(r'(@\w*)\s*({(\s*)[^,\s]*)\s*,',
                      r'\n\n\g<1>\g<2>,\n', filecont)

    # add new lines after }
    filecont = re.sub(r'"\s*}', '"\n}\n', filecont)
    filecont = re.sub(r'}\s*,', '},\n', filecont)

    filecont = re.sub(r'@(\w*)', r'\n@\g<1>', filecont)

    # character encoding, reserved latex characters
    filecont = re.sub(r'{\\&}', '&', filecont)
    filecont = re.sub(r'\\&', '&', filecont)

    # Track the brace depth so that the '}' which closes a top-level group
    # is preceded by a line break.
    open_brace_count = 0
    rebuilt = []
    for phrase in re.split(r'([{}])', filecont):
        if phrase == '{':
            open_brace_count = open_brace_count + 1
        elif phrase == '}':
            open_brace_count = open_brace_count - 1
            if open_brace_count == 0:
                rebuilt.append('\n')
        rebuilt.append(phrase)
    filecont = ''.join(rebuilt)

    filecont2 = bibtex_replace_abbreviations(filecont)

    # gather: keep the non-blank lines, each terminated by a newline
    return [line + '\n'
            for line in filecont2.splitlines()
            if line != '' and line != ' ']
kpeter@743
   783
kpeter@743
   784
kpeter@743
   785
def filehandler(filepath):
    """Convert the BibTeX file at filepath and write a Doxygen page to stdout.

    The file is read, cleaned with bibtexwasher() and decoded with
    bibtexdecoder(); the resulting lines are written between a
    '/**' + '\\page references References' header and a closing '*/'.

    filepath -- path of the .bib file to convert

    If the file cannot be opened or read, the message
    'Could not open file: <filepath>' is written to stderr and the program
    exits with a non-zero status (SystemExit).  Standard output is usually
    redirected into the generated .dox file, so the message must not go
    there.
    """
    import sys

    try:
        fd = open(filepath, 'r')
        try:
            filecont_source = fd.readlines()
        finally:
            # close the file even if reading fails
            fd.close()
    except (IOError, OSError):
        # Stop here: there is no data to convert.
        sys.exit('Could not open file: %s' % filepath)

    washeddata = bibtexwasher(filecont_source)
    outdata = bibtexdecoder(washeddata)

    # sys.stdout.write() behaves the same on Python 2 and Python 3.
    write = sys.stdout.write
    write('/**\n')
    write('\\page references References\n')
    write('\n')
    for line in outdata:
        write('%s\n' % (line,))
    write('*/\n')
kpeter@743
   800
kpeter@743
   801
kpeter@743
   802
# main program
kpeter@743
   803
kpeter@743
   804
def main():
    """Command-line entry point.

    Converts the file named by the first command-line argument with
    filehandler().  When no argument is given, 'No input file' is written
    to stderr and the program exits with a non-zero status, so a calling
    script or build can detect the failure and the message does not end up
    in redirected output.
    """
    import sys

    if len(sys.argv) < 2:
        sys.exit("No input file")

    filehandler(sys.argv[1])
kpeter@743
   812
kpeter@743
   813
# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()
kpeter@743
   814
kpeter@743
   815
kpeter@743
   816
# end python script