scripts/bib2dox.py
author Peter Kovacs <kpeter@inf.elte.hu>
Tue, 15 Mar 2011 19:32:21 +0100
changeset 936 ddd3c0d3d9bf
parent 754 2de0fc630899
child 1052 eb2f9d453070
permissions -rwxr-xr-x
Implement the scaling Price Refinement heuristic in CostScaling (#417)
instead of Early Termination.

These two heuristics are similar, but the newer one is faster
and not only makes it possible to skip some epsilon phases, but
it can improve the performance of the other phases, as well.
kpeter@836
     1
#! /usr/bin/env python
kpeter@743
     2
"""
kpeter@743
     3
  BibTeX to Doxygen converter
kpeter@743
     4
  Usage: python bib2dox.py bibfile.bib > bibfile.dox
kpeter@743
     5
kpeter@836
     6
  This file is a part of LEMON, a generic C++ optimization library.
kpeter@836
     7
kpeter@836
     8
  **********************************************************************
kpeter@836
     9
kpeter@743
    10
  This code is the modification of the BibTeX to XML converter
kpeter@836
    11
  by Vidar Bronken Gundersen et al.
kpeter@836
    12
  See the original copyright notices below. 
kpeter@743
    13
kpeter@743
    14
  **********************************************************************
kpeter@743
    15
kpeter@743
    16
  Decoder for bibliographic data, BibTeX
kpeter@743
    17
  Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
kpeter@743
    18
kpeter@743
    19
  v.8
kpeter@743
    20
  (c)2002-06-23 Vidar Bronken Gundersen
kpeter@743
    21
  http://bibtexml.sf.net/
kpeter@743
    22
  Reuse approved as long as this notification is kept.
kpeter@743
    23
  Licence: GPL.
kpeter@743
    24
kpeter@743
    25
  Contributions/thanks to:
kpeter@743
    26
  Egon Willighagen, http://sf.net/projects/jreferences/
kpeter@743
    27
  Richard Mahoney (for providing a test case)
kpeter@743
    28
kpeter@743
    29
  Edited by Sara Sprenkle to be more robust and handle more bibtex features.
kpeter@743
    30
  (c) 2003-01-15
kpeter@743
    31
kpeter@743
    32
  1.  Changed bibtex: tags to bibxml: tags.
kpeter@743
    33
  2.  Use xmlns:bibxml="http://bibtexml.sf.net/"
kpeter@743
    34
  3.  Allow spaces between @type and first {
kpeter@743
    35
  4.  "author" fields with multiple authors split by " and "
kpeter@743
    36
      are put in separate xml "bibxml:author" tags.
kpeter@743
    37
  5.  Option for Titles: words are capitalized
kpeter@743
    38
      only if first letter in title or capitalized inside braces
kpeter@743
    39
  6.  Removes braces from within field values
kpeter@743
    40
  7.  Ignores comments in bibtex file (including @comment{ or % )
kpeter@743
    41
  8.  Replaces some special latex tags, e.g., replaces ~ with '&#160;'
kpeter@743
    42
  9.  Handles bibtex @string abbreviations
kpeter@743
    43
        --> includes bibtex's default abbreviations for months
kpeter@743
    44
        --> does concatenation of abbr # " more " and " more " # abbr
kpeter@743
    45
  10. Handles @type( ... ) or @type{ ... }
kpeter@743
    46
  11. The keywords field is split on , or ; and put into separate xml
kpeter@743
    47
      "bibxml:keywords" tags
kpeter@743
    48
  12. Ignores @preamble
kpeter@743
    49
kpeter@743
    50
  Known Limitations
kpeter@743
    51
  1.  Does not transform Latex encoding like math mode and special
kpeter@743
    52
      latex symbols.
kpeter@743
    53
  2.  Does not parse author fields into first and last names.
kpeter@743
    54
      E.g., It does not do anything special to an author whose name is
kpeter@743
    55
      in the form LAST_NAME, FIRST_NAME
kpeter@743
    56
      In "author" tag, will show up as
kpeter@743
    57
      <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
kpeter@743
    58
  3.  Does not handle "crossref" fields other than to print
kpeter@743
    59
      <bibxml:crossref>...</bibxml:crossref>
kpeter@743
    60
  4.  Does not inform user of the input's format errors.  You just won't
kpeter@743
    61
      be able to transform the file later with XSL
kpeter@743
    62
kpeter@743
    63
  You will have to manually edit the XML output if you need to handle
kpeter@743
    64
  these (and unknown) limitations.
kpeter@743
    65
kpeter@743
    66
"""
kpeter@743
    67
kpeter@743
    68
import string, re
kpeter@743
    69
kpeter@743
    70
# set of valid name characters
valid_name_chars = r'[\w\-:]'

#
# define global regular expression variables
#
# All patterns are raw strings: the regex escapes (\s, \w, ...) are not
# valid Python string escapes, and '\u' inside a non-raw literal is a
# syntax error on Python 3.
#
author_rex = re.compile(r'\s+and\s+')
rembraces_rex = re.compile(r'[{}]')
capitalize_rex = re.compile(r'({[^}]*})')

# used by bibtexkeywords(data)
keywords_rex = re.compile(r'[,;]')

# used by concat_line(line)
concatsplit_rex = re.compile(r'\s*#\s*')

# split on {, }, or " in verify_out_of_braces
delimiter_rex = re.compile(r'([{}"])', re.I)

field_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
data_rex = re.compile(r'\s*(\w*)\s*=\s*([^,]*),?')

# matches a literal \url{...} command and captures the address
url_rex = re.compile(r'\\url\{([^}]*)\}')

#
# styles for html formatting
#
divstyle = 'margin-top: -4ex; margin-left: 8em;'
kpeter@743
    98
kpeter@743
    99
#
kpeter@743
   100
# return the string parameter with \url{...} commands turned into HTML links
kpeter@743
   101
#
kpeter@743
   102
def transformurls(str):
    r"""Return ``str`` with each ``\url{...}`` command replaced by an HTML link.

    The address captured by the module-level ``url_rex`` is used both as the
    ``href`` target and as the visible link text.
    """
    return url_rex.sub(r'<a href="\1">\1</a>', str)
kpeter@743
   104
kpeter@743
   105
#
kpeter@743
   106
# return the string parameter without braces
kpeter@743
   107
#
kpeter@743
   108
def removebraces(str):
    """Return ``str`` with every ``{`` and ``}`` character removed.

    The characters to drop are defined by the module-level ``rembraces_rex``.
    """
    return rembraces_rex.sub('', str)
kpeter@743
   110
kpeter@743
   111
#
kpeter@743
   112
# latex-specific replacements
kpeter@743
   113
# (do this after braces were removed)
kpeter@743
   114
#
kpeter@743
   115
def latexreplacements(line):
    """Return ``line`` with a fixed set of LaTeX sequences replaced by HTML entities.

    Handles ``~`` (non-breaking space) and the acute, umlaut and Hungarian
    double-acute (``\\H``) accents on the vowels a/e/i/o/u in both cases.
    Intended to be applied after braces have been removed.
    """
    replacements = (
        ('~', '&nbsp;'),
        ("\\'a", '&aacute;'),
        ('\\"a', '&auml;'),
        ("\\'e", '&eacute;'),
        ('\\"e', '&euml;'),
        ("\\'i", '&iacute;'),
        ('\\"i', '&iuml;'),
        ("\\'o", '&oacute;'),
        ('\\"o', '&ouml;'),
        ("\\'u", '&uacute;'),
        ('\\"u', '&uuml;'),
        ('\\H o', '&otilde;'),
        ('\\H u', '&uuml;'),   # &utilde; does not exist
        ("\\'A", '&Aacute;'),
        ('\\"A', '&Auml;'),
        ("\\'E", '&Eacute;'),
        ('\\"E', '&Euml;'),
        ("\\'I", '&Iacute;'),
        ('\\"I', '&Iuml;'),
        ("\\'O", '&Oacute;'),
        ('\\"O', '&Ouml;'),
        ("\\'U", '&Uacute;'),
        ('\\"U', '&Uuml;'),
        ('\\H O', '&Otilde;'),
        ('\\H U', '&Uuml;'),   # &Utilde; does not exist
    )
    # str.replace is used instead of string.replace(), which only exists
    # in the Python 2 "string" module.
    for latex, html in replacements:
        line = line.replace(latex, html)
    return line
kpeter@743
   143
kpeter@743
   144
#
kpeter@743
   145
# copy characters from a string, decoding html expressions (&xyz;)
kpeter@743
   146
#
kpeter@743
   147
def copychars(str, ifrom, count):
    """Copy up to ``count`` letters from ``str``, starting at index ``ifrom``.

    Only ASCII letters (A-Z, a-z) are copied; any other character is
    skipped without being counted.  An HTML entity (``&xyz;``) counts as
    one character and contributes the character that follows the ``&``
    (e.g. ``&aacute;`` yields ``a``); the rest of the entity up to the
    closing ``;`` is skipped.

    Returns the collected characters as a string (possibly shorter than
    ``count`` if the input runs out).
    """
    result = ''
    i = ifrom
    copied = 0
    in_entity = False
    while i < len(str) and copied < count:
        ch = str[i]
        if ch == '&':
            # start of an entity: take its first letter, then skip to ';'
            in_entity = True
            if i + 1 < len(str):
                result += str[i + 1]
            copied += 1
            i += 2
            continue
        if in_entity:
            if ch == ';':
                in_entity = False
        elif ('A' <= ch <= 'Z') or ('a' <= ch <= 'z'):
            result += ch
            copied += 1
        i += 1

    return result
kpeter@743
   170
kpeter@743
   171
kpeter@743
   172
# 
kpeter@743
   173
# Handle a list of authors (separated by 'and').
kpeter@743
   174
# It gives back a dictionary of the following values:
kpeter@743
   175
#  - num: the number of authors,
kpeter@743
   176
#  - list: the list of the author names,
kpeter@743
   177
#  - text: the bibtex text (separated by commas and/or 'and')
kpeter@743
   178
#  - abbrev: abbreviation that can be used for indicate the
kpeter@743
   179
#    bibliography entries
kpeter@743
   180
#
kpeter@743
   181
def bibtexauthor(data):
    """Parse a BibTeX author/editor field into a dictionary.

    ``data`` is split on the word ``and`` (via ``author_rex``).  Each name
    has its braces removed and LaTeX accents replaced, and names given as
    "Last, First" are reordered to "First Last".

    Returns a dict with the keys:
      - 'num':    the number of authors,
      - 'list':   the list of normalized author names,
      - 'text':   the names joined as "A", "A and B" or "A, B, and C",
      - 'abbrev': an abbreviation built with copychars() from the last
                  word of each name (three letters for a single author,
                  one letter per author otherwise).
    """
    authors = []
    for raw in author_rex.split(data):
        # general transformations
        author = latexreplacements(removebraces(raw.strip()))
        # transform "Xyz, A. B." to "A. B. Xyz"
        pos = author.find(',')
        if pos != -1:
            author = author[pos+1:].strip() + ' ' + author[:pos].strip()
        authors.append(author)

    # Join the names directly instead of through a '#' placeholder, so a
    # name that itself contains '#' is left intact.
    num = len(authors)
    if num == 1:
        text = authors[0]
    elif num == 2:
        text = authors[0] + ' and ' + authors[1]
    else:
        text = ', '.join(authors[:-1]) + ', and ' + authors[-1]

    if num == 1:
        count = 3
    else:
        count = 1
    abbrev = ''
    for author in authors:
        abbrev += copychars(author, author.rfind(' ') + 1, count)

    result = {}
    result['list'] = authors
    result['num'] = num
    result['text'] = text
    result['abbrev'] = abbrev
    return result
kpeter@743
   214
kpeter@743
   215
kpeter@743
   216
#
kpeter@743
   217
# data = title string
kpeter@743
   218
# @return the capitalized title (first letter is capitalized), rest are capitalized
kpeter@743
   219
# only if capitalized inside braces
kpeter@743
   220
#
kpeter@743
   221
def capitalizetitle(data):
    """Return the title ``data`` with BibTeX-style capitalization applied.

    The title is split into brace groups and the text between them (via
    ``capitalize_rex``).  A piece that starts with ``{`` keeps its case and
    only loses its braces.  Of the remaining pieces, the very first piece
    of the title is stripped of leading whitespace and capitalized; all
    later ones are lower-cased.
    """
    title = ''
    for index, phrase in enumerate(capitalize_rex.split(data)):
        # str.lstrip replaces string.lstrip(), which is Python 2 only
        check = phrase.lstrip()

        if check.startswith('{'):
            # keep phrase's capitalization the same
            title += removebraces(phrase)
        elif index == 0:
            # first word --> capitalize first letter (after spaces)
            title += check.capitalize()
        else:
            title += phrase.lower()

    return title
kpeter@743
   240
kpeter@743
   241
kpeter@743
   242
#
kpeter@743
   243
# @return the bibtex for the title
kpeter@743
   244
# @param data --> title string
kpeter@743
   245
# braces are removed from title
kpeter@743
   246
#
kpeter@743
   247
def bibtextitle(data, entrytype):
    """Return the output text for the title ``data`` of an entry.

    Titles of 'book' and 'inbook' entries keep their capitalization; for
    every other ``entrytype`` the title is passed through capitalizetitle().
    In both cases the title is stripped and its braces are removed.
    """
    if entrytype in ('book', 'inbook'):
        return removebraces(data.strip())
    return removebraces(capitalizetitle(data.strip()))
kpeter@743
   254
kpeter@743
   255
kpeter@743
   256
#
kpeter@743
   257
# function to compare entry lists
kpeter@743
   258
#
kpeter@743
   259
def entry_cmp(x, y):
    """Three-way compare two entry lists by their first element (the sort key).

    Returns -1, 0 or 1.  Written without the cmp() builtin, which does not
    exist on Python 3.
    """
    return (x[0] > y[0]) - (x[0] < y[0])
kpeter@743
   261
kpeter@743
   262
kpeter@743
   263
#
kpeter@743
   264
# return the Doxygen output lines generated from "filecont_source"
kpeter@743
   265
#
kpeter@743
   266
def _has_value(entrycont, key):
    """Return True if ``key`` is present in ``entrycont`` with a non-empty value."""
    return key in entrycont and entrycont[key] != ''


def bibtexdecoder(filecont_source):
    """Convert BibTeX entry lines into a list of Doxygen output lines.

    ``filecont_source`` is an iterable of text lines.  The last character
    of every line is dropped (presumably the trailing newline -- verify
    against the caller).  An entry starts on a line matching
    ``@type{id,`` and ends on a line that begins with ``}``; the lines in
    between are parsed as ``field = {data}``, ``field = "data"`` or
    ``field = data``.  Lines outside of an entry are ignored.

    Returns a list of strings: for each entry (sorted by author names,
    year and title, optionally prefixed by its 'key' field) a
    ``\\section`` line, an opening ``<div>``, the formatted parts of the
    entry, a closing ``</div>`` and an empty line.
    """
    filecont = []
    output = []

    # want @<alphanumeric chars><spaces>{<spaces><any chars>,
    pubtype_rex = re.compile(r'@(\w*)\s*{\s*(.*),')
    endtype_rex = re.compile(r'}\s*$')

    bracefield_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
    bracedata_rex = re.compile(r'\s*(\w*)\s*=\s*{(.*)},?')

    quotefield_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
    quotedata_rex = re.compile(r'\s*(\w*)\s*=\s*"(.*)",?')

    # None while no entry is open, so that stray lines before the first
    # entry or after a closing brace cannot touch undefined/stale state
    entrycont = None
    entry = None
    entrytype = ''
    entryid = ''

    for line in filecont_source:
        line = line[:-1]

        # encode character entities
        line = line.replace('&', '&amp;')
        line = line.replace('<', '&lt;')
        line = line.replace('>', '&gt;')

        # start entry: publication type (store for later use)
        if pubtype_rex.match(line):
            entrycont = {}
            entry = []
            entrytype = pubtype_rex.sub(r'\g<1>', line).lower()
            entryid = pubtype_rex.sub(r'\g<2>', line)

        elif entrycont is None:
            # not inside an entry: nothing to collect or to finish
            continue

        # end entry if just a }
        elif endtype_rex.match(line):
            # generate doxygen code for the entry

            # entry type related formattings (a missing field is skipped)
            if entrytype in ('book', 'inbook'):
                if 'title' in entrycont:
                    entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
                if 'author' not in entrycont and 'editor' in entrycont:
                    entrycont['author'] = entrycont['editor']
                    entrycont['author']['text'] += ', editors'
            elif entrytype == 'article':
                if 'journal' in entrycont:
                    entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
            elif entrytype in ('inproceedings', 'incollection', 'conference'):
                if 'booktitle' in entrycont:
                    entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
            elif entrytype == 'techreport':
                if 'type' not in entrycont:
                    entrycont['type'] = 'Technical report'
            elif entrytype == 'mastersthesis':
                entrycont['type'] = 'Master\'s thesis'
            elif entrytype == 'phdthesis':
                entrycont['type'] = 'PhD thesis'

            if _has_value(entrycont, 'pages'):
                entrycont['pages'] = entrycont['pages'].replace('--', '-')

            if _has_value(entrycont, 'author'):
                entry.append(entrycont['author']['text'] + '.')
            if _has_value(entrycont, 'title'):
                entry.append(entrycont['title'] + '.')
            if _has_value(entrycont, 'journal'):
                entry.append(entrycont['journal'] + ',')
            if _has_value(entrycont, 'booktitle'):
                entry.append('In ' + entrycont['booktitle'] + ',')
            if _has_value(entrycont, 'type'):
                eline = entrycont['type']
                if _has_value(entrycont, 'number'):
                    eline += ' ' + entrycont['number']
                eline += ','
                entry.append(eline)
            if _has_value(entrycont, 'institution'):
                entry.append(entrycont['institution'] + ',')
            if _has_value(entrycont, 'publisher'):
                entry.append(entrycont['publisher'] + ',')
            if _has_value(entrycont, 'school'):
                entry.append(entrycont['school'] + ',')
            if _has_value(entrycont, 'address'):
                entry.append(entrycont['address'] + ',')
            if _has_value(entrycont, 'edition'):
                entry.append(entrycont['edition'] + ' edition,')
            if _has_value(entrycont, 'howpublished'):
                entry.append(entrycont['howpublished'] + ',')
            if _has_value(entrycont, 'volume'):
                eline = entrycont['volume']
                if _has_value(entrycont, 'number'):
                    eline += '(' + entrycont['number'] + ')'
                if _has_value(entrycont, 'pages'):
                    eline += ':' + entrycont['pages']
                eline += ','
                entry.append(eline)
            else:
                if _has_value(entrycont, 'pages'):
                    entry.append('pages ' + entrycont['pages'] + ',')
            if _has_value(entrycont, 'year'):
                if _has_value(entrycont, 'month'):
                    entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
                else:
                    entry.append(entrycont['year'] + '.')
            if _has_value(entrycont, 'note'):
                entry.append(entrycont['note'] + '.')
            if _has_value(entrycont, 'url'):
                entry.append(entrycont['url'] + '.')

            # generate keys for sorting and for the output
            sortkey = ''
            bibkey = ''
            if 'author' in entrycont:
                for author in entrycont['author']['list']:
                    sortkey += copychars(author, author.rfind(' ')+1, len(author))
                bibkey = entrycont['author']['abbrev']
            else:
                bibkey = 'x'
            if 'year' in entrycont:
                sortkey += entrycont['year']
                bibkey += entrycont['year'][-2:]
            if 'title' in entrycont:
                sortkey += entrycont['title']
            if 'key' in entrycont:
                sortkey = entrycont['key'] + sortkey
                bibkey = entrycont['key']
            entry.insert(0, sortkey)
            entry.insert(1, bibkey)
            entry.insert(2, entryid)

            # add the entry to the file contents
            filecont.append(entry)

            # the entry is closed; ignore everything up to the next header
            entrycont = None

        else:
            # field, publication info
            field = ''
            data = ''

            # field = {data} entries
            if bracedata_rex.match(line):
                field = bracefield_rex.sub(r'\g<1>', line).lower()
                data = bracedata_rex.sub(r'\g<2>', line)

            # field = "data" entries
            elif quotedata_rex.match(line):
                field = quotefield_rex.sub(r'\g<1>', line).lower()
                data = quotedata_rex.sub(r'\g<2>', line)

            # field = data entries
            elif data_rex.match(line):
                field = field_rex.sub(r'\g<1>', line).lower()
                data = data_rex.sub(r'\g<2>', line)

            # wrap the address so that transformurls() turns it into a link
            if field == 'url':
                data = '\\url{' + data.strip() + '}'

            if field in ('author', 'editor'):
                entrycont[field] = bibtexauthor(data)
                line = ''
            elif field == 'title':
                line = bibtextitle(data, entrytype)
            elif field != '':
                line = removebraces(transformurls(data.strip()))

            if line != '':
                line = latexreplacements(line)
                entrycont[field] = line

    # sort entries by their sort key (first element); a key function is
    # used because list.sort() takes no comparison function on Python 3
    filecont.sort(key=lambda item: item[0])

    # count the bibtex keys
    keytable = {}
    for entry in filecont:
        bibkey = entry[1]
        keytable[bibkey] = keytable.get(bibkey, 0) + 1

    counttable = {}
    for bibkey in keytable:
        counttable[bibkey] = 0

    # generate output
    for entry in filecont:
        # generate output key from the bibtex key; keys shared by several
        # entries get a distinguishing letter suffix (a, b, c, ...)
        bibkey = entry[1]
        entryid = entry[2]
        if keytable[bibkey] == 1:
            outkey = bibkey
        else:
            outkey = bibkey + chr(97 + counttable[bibkey])
        counttable[bibkey] += 1

        # append the entry code to the output
        output.append('\\section ' + entryid + ' [' + outkey + ']')
        output.append('<div style="' + divstyle + '">')
        output.extend(entry[3:])
        output.append('</div>')
        output.append('')

    return output
kpeter@743
   473
kpeter@743
   474
kpeter@743
   475
#
kpeter@743
   476
# return 1 iff abbr is in line but not inside braces or quotes
kpeter@743
   477
# assumes that abbr appears only once on the line (out of braces and quotes)
kpeter@743
   478
#
kpeter@743
   479
def verify_out_of_braces(line, abbr):
    """Return 1 if ``abbr`` occurs in ``line`` outside of braces and quotes, else 0.

    ``line`` is split on ``{``, ``}`` and ``"`` (via ``delimiter_rex``);
    brace depth and quote state are tracked across the pieces, and
    ``abbr`` is searched as a whole word, ignoring case, in the text
    pieces.
    """
    # escape abbr so that it is always matched literally
    abbr_rex = re.compile(r'\b' + re.escape(abbr) + r'\b', re.I)

    open_brace = 0
    open_quote = 0

    for phrase in delimiter_rex.split(line):
        if phrase == "{":
            open_brace += 1
        elif phrase == "}":
            open_brace -= 1
        elif phrase == '"':
            # a quote toggles the "inside quotes" state
            open_quote = 1 - open_quote
        elif abbr_rex.search(phrase):
            if open_brace == 0 and open_quote == 0:
                return 1

    return 0
kpeter@743
   503
kpeter@743
   504
kpeter@743
   505
#
kpeter@743
   506
# a line in the form phrase1 # phrase2 # ... # phrasen
kpeter@743
   507
# is returned as phrase1 phrase2 ... phrasen
kpeter@743
   508
# with the correct punctuation
kpeter@743
   509
# Bug: Doesn't always work with multiple abbreviations plugged in
kpeter@743
   510
#
kpeter@743
   511
def concat_line(line):
    """Join the '#'-concatenated parts of a ``field = a # b # ...`` line.

    The value is split on ``#`` (via ``concatsplit_rex``) and the pieces
    are joined with single spaces.  The quote/brace delimiters between
    adjacent pieces are dropped, and an outer pair of double quotes is
    turned into braces.  A piece ending in a backslash gets its ``#``
    back, since that was an escaped ``\\#`` and not a concatenation.

    Returns the rebuilt line in the form ``field = value``.
    Known bug: doesn't always work with multiple abbreviations plugged in.
    """
    # only look at part after equals
    field = field_rex.sub(r'\g<1>', line)
    rest = field_rex.sub(r'\g<2>', line)

    pieces = concatsplit_rex.split(rest)
    last = len(pieces) - 1

    joined = field + ' ='
    for index, phrase in enumerate(pieces):
        phrase = phrase.strip()

        # opening delimiter: dropped on inner pieces, '"' -> '{' on the first
        if index != 0:
            if phrase.startswith(('"', '{')):
                phrase = phrase[1:]
        elif phrase.startswith('"'):
            phrase = phrase.replace('"', '{', 1)

        # closing delimiter: dropped on inner pieces, '"' -> '}' on the last
        if index != last:
            if phrase.endswith(('"', '}')):
                phrase = phrase[:-1]
        else:
            if phrase.endswith('"'):
                phrase = phrase[:-1] + "}"
            elif phrase.endswith('",'):
                phrase = phrase[:-2] + "},"

        # if phrase did have \#, add the \# back
        if phrase.endswith('\\'):
            phrase = phrase + "#"
        joined = joined + ' ' + phrase

    return joined
kpeter@743
   550
kpeter@743
   551
kpeter@743
   552
#
kpeter@743
   553
# substitute abbreviations into filecont
kpeter@743
   554
# @param filecont_source - string of data from file
kpeter@743
   555
#
kpeter@743
   556
def bibtex_replace_abbreviations(filecont_source):
    """Expand BibTeX abbreviations in the washed file content.

    The input string is processed line by line:

    * blank lines are dropped;
    * ``@string`` definition lines are recorded as new abbreviations and
      dropped, ``@comment`` and ``@preamble`` lines are dropped, and the
      next line containing ``}`` after any of them is dropped as well;
    * in every other line each known abbreviation (the twelve month names
      plus the recorded ``@string`` names) accepted by
      ``verify_out_of_braces`` is replaced by its value, and lines with a
      ``#`` concatenation are merged with ``concat_line``.

    Finally the text is normalised so that values end with a newline.

    @param filecont_source - string of data from file
    @return the processed data as one string
    """
    #  These are defined in bibtex, so we'll define them too
    abbr_list = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    value_list = ['January', 'February', 'March', 'April',
                  'May', 'June', 'July', 'August', 'September',
                  'October', 'November', 'December']

    front = r'\b'
    back = r'(,?)\b'

    def abbr_pattern(name):
        # escape the name so that it is always matched literally
        return re.compile(front + re.escape(name) + back, re.I)

    # abbr_list, value_list and abbr_rex are kept parallel
    abbr_rex = [abbr_pattern(name) for name in abbr_list]

    abbrdef_rex = re.compile(
        r'\s*@string\s*{\s*(' + valid_name_chars + r'*)\s*=(.*)', re.I)
    comment_rex = re.compile(r'@comment\s*{', re.I)
    preamble_rex = re.compile(r'@preamble\s*{', re.I)

    waiting_for_end_string = 0
    out_lines = []

    for line in filecont_source.splitlines():
        if line == ' ' or line == '':
            continue

        # NOTE(review): while waiting, a line without '}' is not skipped;
        # it falls through and is processed like any other line.
        if waiting_for_end_string:
            if '}' in line:
                waiting_for_end_string = 0
                continue

        abbrdef = abbrdef_rex.search(line)
        if abbrdef:
            abbr = abbrdef.group(1)
            # the first definition of a name wins
            if abbr not in abbr_list:
                abbr_list.append(abbr)
                value_list.append(abbrdef.group(2).strip())
                abbr_rex.append(abbr_pattern(abbr))
            waiting_for_end_string = 1
            continue

        if comment_rex.search(line) or preamble_rex.search(line):
            waiting_for_end_string = 1
            continue

        # replace subsequent abbreviations with the value
        for abbr, rex, value in zip(abbr_list, abbr_rex, value_list):
            if rex.search(line):
                if verify_out_of_braces(line, abbr) == 1:
                    # A callable replacement keeps backslashes of the value
                    # (e.g. LaTeX commands) from being read as regex
                    # template escapes.
                    line = rex.sub(
                        lambda m, value=value: value + m.group(1), line)
                # Check for # concatenations
                if concatsplit_rex.search(line):
                    line = concat_line(line)

        out_lines.append(line + '\n')

    filecont2 = ''.join(out_lines)

    # Do one final pass over file

    # make sure that didn't end up with {" or }" after the substitution
    filecont2 = filecont2.replace('{"', '{{')
    filecont2 = filecont2.replace('"}', '}}')

    afterquotevalue_rex = re.compile(r'"\s*,\s*')
    afterbrace_rex = re.compile(r'"\s*}')
    afterbracevalue_rex = re.compile(r'(=\s*{[^=]*)},\s*')

    # add new lines to data that changed because of abbreviation substitutions
    filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
    filecont2 = afterbrace_rex.sub('"\n}', filecont2)
    filecont2 = afterbracevalue_rex.sub(r'\g<1>},\n', filecont2)

    return filecont2
kpeter@743
   651
kpeter@743
   652
#
kpeter@743
   653
# convert @type( ... ) to @type{ ... }
kpeter@743
   654
#
kpeter@743
   655
def no_outer_parens(filecont):
    """Convert paren-delimited entries ``@type( ... )`` to ``@type{ ... }``.

    The text is split on every ``(``, ``)``, ``{`` and ``}``.  A ``(`` that
    directly follows a phrase containing ``@`` is turned into ``{`` and the
    ``)`` that brings the paren depth back to zero is turned into ``}``.
    All other parentheses and braces are left untouched.

    @param filecont - the file content as one string
    @return the converted string
    """
    # do checking for open parens
    # will convert to braces
    paren_split = re.split(r'([(){}])', filecont)

    open_paren_count = 0
    open_type = 0   # 1 while inside an entry whose '(' was converted
    look_next = 0   # 1 when the previous phrase contained '@'

    at_rex = re.compile(r'@\w*')

    # rebuild filecont
    rebuilt = []

    for phrase in paren_split:
        if look_next == 1:
            if phrase == '(':
                phrase = '{'
                open_paren_count = open_paren_count + 1
            else:
                open_type = 0
            look_next = 0

        if phrase == '(':
            open_paren_count = open_paren_count + 1

        elif phrase == ')':
            open_paren_count = open_paren_count - 1
            if open_type == 1 and open_paren_count == 0:
                phrase = '}'
                open_type = 0

        # Only look for a new '@type' outside of parentheses: an '@' inside
        # a field value (e.g. an e-mail address) of a paren-delimited entry
        # must not reset open_type, otherwise the closing ')' is missed.
        elif open_paren_count == 0 and at_rex.search(phrase):
            open_type = 1
            look_next = 1

        rebuilt.append(phrase)

    return ''.join(rebuilt)
kpeter@743
   695
kpeter@743
   696
kpeter@743
   697
#
kpeter@743
   698
# make all whitespace into just one space
kpeter@743
   699
# format the bibtex file into a usable form.
kpeter@743
   700
#
kpeter@743
   701
def bibtexwasher(filecont_source):
    """Normalise raw BibTeX lines into the one-item-per-line form.

    Every input line is stripped and its whitespace runs are collapsed to a
    single space; empty lines and lines starting with ``%`` are dropped.
    The remaining text is joined into one string, paren-delimited entries
    are converted with ``no_outer_parens``, newlines are inserted after
    field values and around entry headers, ``\\&`` is replaced by ``&``,
    and abbreviations are expanded with ``bibtex_replace_abbreviations``.

    @param filecont_source - iterable of the lines of the bibtex file
    @return list of the non-blank washed lines, each ending with a newline
    """
    space_rex = re.compile(r'\s+')
    comment_rex = re.compile(r'\s*%')

    # remove trailing and excessive whitespace
    # ignore comments
    kept = []
    for line in filecont_source:
        line = space_rex.sub(' ', line.strip())
        if not comment_rex.match(line) and line != '':
            kept.append(' ' + line)

    # the file is in one long string
    filecont = ''.join(kept)

    filecont = no_outer_parens(filecont)

    #
    # split lines according to preferred syntax scheme
    #
    filecont = re.sub(r'(=\s*{[^=]*)},', r'\g<1>},\n', filecont)

    # add new lines after commas that are after values
    filecont = re.sub(r'"\s*,', '",\n', filecont)
    filecont = re.sub(r'=\s*([\w\d]+)\s*,', r'= \g<1>,\n', filecont)
    filecont = re.sub(r'(@\w*)\s*({(\s*)[^,\s]*)\s*,',
                      r'\n\n\g<1>\g<2>,\n', filecont)

    # add new lines after }
    filecont = re.sub(r'"\s*}', '"\n}\n', filecont)
    filecont = re.sub(r'}\s*,', '},\n', filecont)

    filecont = re.sub(r'@(\w*)', r'\n@\g<1>', filecont)

    # character encoding, reserved latex characters
    filecont = re.sub(r'{\\&}', '&', filecont)
    filecont = re.sub(r'\\&', '&', filecont)

    # do checking for open braces to get format correct:
    # a '}' that closes an outermost brace is put on its own line
    open_brace_count = 0
    pieces = []
    for phrase in re.split(r'([{}])', filecont):
        if phrase == '{':
            open_brace_count = open_brace_count + 1
        elif phrase == '}':
            open_brace_count = open_brace_count - 1
            if open_brace_count == 0:
                pieces.append('\n')
        pieces.append(phrase)
    filecont = ''.join(pieces)

    filecont2 = bibtex_replace_abbreviations(filecont)

    # gather the lines, ignoring the blank ones
    return [line + '\n'
            for line in filecont2.splitlines()
            if line != '' and line != ' ']
kpeter@743
   783
kpeter@743
   784
kpeter@743
   785
def filehandler(filepath):
    """Convert the BibTeX file at ``filepath`` and print a Doxygen page.

    The file is read, washed with ``bibtexwasher`` and decoded with
    ``bibtexdecoder``; the result is printed to standard output between
    ``/**`` and ``*/`` as the ``references`` page.

    If the file cannot be opened, a message is written to standard error
    (standard output is normally redirected into the generated .dox file)
    and the script exits with status 1.

    @param filepath - path of the bibtex file
    """
    import sys

    try:
        fd = open(filepath, 'r')
    except IOError:
        sys.stderr.write('Could not open file: ' + filepath + '\n')
        sys.exit(1)

    # always release the file handle, even if reading fails
    try:
        filecont_source = fd.readlines()
    finally:
        fd.close()

    washeddata = bibtexwasher(filecont_source)
    outdata = bibtexdecoder(washeddata)

    # single-argument print(...) behaves the same in Python 2 and 3
    print('/**')
    print(r'\page references References')
    print('')
    for line in outdata:
        print(line)
    print('*/')
kpeter@743
   800
kpeter@743
   801
kpeter@743
   802
# main program
kpeter@743
   803
kpeter@743
   804
def main():
    """Command line entry point: convert the file named by the first argument.

    Without an argument a message is written to standard error and the
    script exits with status 1, so that the failure is visible even when
    standard output is redirected into the .dox file.
    """
    import sys

    if not sys.argv[1:]:
        sys.stderr.write("No input file\n")
        sys.exit(1)

    filehandler(sys.argv[1])
kpeter@743
   812
kpeter@743
   813
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
kpeter@743
   814
kpeter@743
   815
kpeter@743
   816
# end python script