lemon-1.3: scripts/bib2dox.py@9312d6c89d02

     1 #! /usr/bin/env python

     2 """

     3   BibTeX to Doxygen converter

     4   Usage: python bib2dox.py bibfile.bib > bibfile.dox

     6   This file is a part of LEMON, a generic C++ optimization library.

     8   **********************************************************************

    10   This code is the modification of the BibTeX to XML converter

    11   by Vidar Bronken Gundersen et al.

    12   See the original copyright notices below.

    14   **********************************************************************

    16   Decoder for bibliographic data, BibTeX

    17   Usage: python bibtex2xml.py bibfile.bib > bibfile.xml

    19   v.8

    20   (c)2002-06-23 Vidar Bronken Gundersen

    21   http://bibtexml.sf.net/

    22   Reuse approved as long as this notification is kept.

    23   Licence: GPL.

    25   Contributions/thanks to:

    26   Egon Willighagen, http://sf.net/projects/jreferences/

    27   Richard Mahoney (for providing a test case)

    29   Edited by Sara Sprenkle to be more robust and handle more bibtex features.

    30   (c) 2003-01-15

    32   1.  Changed bibtex: tags to bibxml: tags.

    33   2.  Use xmlns:bibxml="http://bibtexml.sf.net/"

    34   3.  Allow spaces between @type and first {

    35   4.  "author" fields with multiple authors split by " and "

    36       are put in separate xml "bibxml:author" tags.

    37   5.  Option for Titles: words are capitalized

    38       only if first letter in title or capitalized inside braces

    39   6.  Removes braces from within field values

    40   7.  Ignores comments in bibtex file (including @comment{ or % )

    41   8.  Replaces some special latex tags, e.g., replaces ~ with '&#160;'

    42   9.  Handles bibtex @string abbreviations

    43         --> includes bibtex's default abbreviations for months

    44         --> does concatenation of abbr # " more " and " more " # abbr

    45   10. Handles @type( ... ) or @type{ ... }

    46   11. The keywords field is split on , or ; and put into separate xml

    47       "bibxml:keywords" tags

    48   12. Ignores @preamble

    50   Known Limitations

    51   1.  Does not transform Latex encoding like math mode and special

    52       latex symbols.

    53   2.  Does not parse author fields into first and last names.

    54       E.g., It does not do anything special to an author whose name is

    55       in the form LAST_NAME, FIRST_NAME

    56       In "author" tag, will show up as

    57       <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>

    58   3.  Does not handle "crossref" fields other than to print

    59       <bibxml:crossref>...</bibxml:crossref>

    60   4.  Does not inform user of the input's format errors.  You just won't

    61       be able to transform the file later with XSL

    63   You will have to manually edit the XML output if you need to handle

    64   these (and unknown) limitations.

    66 """

    68 import string, re

    70 # set of valid name characters

    71 valid_name_chars = '[\w\-:]'

    73 #

    74 # define global regular expression variables

    75 #

    76 author_rex = re.compile('\s+and\s+')

    77 rembraces_rex = re.compile('[{}]')

    78 capitalize_rex = re.compile('({[^}]*})')

    80 # used by bibtexkeywords(data)

    81 keywords_rex = re.compile('[,;]')

    83 # used by concat_line(line)

    84 concatsplit_rex = re.compile('\s*#\s*')

    86 # split on {, }, or " in verify_out_of_braces

    87 delimiter_rex = re.compile('([{}"])',re.I)

    89 field_rex = re.compile('\s*(\w*)\s*=\s*(.*)')

    90 data_rex = re.compile('\s*(\w*)\s*=\s*([^,]*),?')

    92 url_rex = re.compile('\\\url\{([^}]*)\}')

    94 #

    95 # styles for html formatting

    96 #

    97 divstyle = 'margin-top: -4ex; margin-left: 8em;'

    99 #

   100 # return the string parameter with \url{...} commands turned into HTML links

   101 #

   102 def transformurls(str):

   103     return url_rex.sub(r'<a href="\1">\1</a>', str)

   105 #

   106 # return the string parameter without braces

   107 #

   108 def removebraces(str):

   109     return rembraces_rex.sub('', str)

   111 #

   112 # latex-specific replacements

   113 # (do this after braces were removed)

   114 #

   115 def latexreplacements(line):

   116     line = string.replace(line, '~', '&nbsp;')

   117     line = string.replace(line, '\\\'a', '&aacute;')

   118     line = string.replace(line, '\\"a', '&auml;')

   119     line = string.replace(line, '\\\'e', '&eacute;')

   120     line = string.replace(line, '\\"e', '&euml;')

   121     line = string.replace(line, '\\\'i', '&iacute;')

   122     line = string.replace(line, '\\"i', '&iuml;')

   123     line = string.replace(line, '\\\'o', '&oacute;')

   124     line = string.replace(line, '\\"o', '&ouml;')

   125     line = string.replace(line, '\\\'u', '&uacute;')

   126     line = string.replace(line, '\\"u', '&uuml;')

   127     line = string.replace(line, '\\H o', '&otilde;')

   128     line = string.replace(line, '\\H u', '&uuml;')   # &utilde; does not exist

   129     line = string.replace(line, '\\\'A', '&Aacute;')

   130     line = string.replace(line, '\\"A', '&Auml;')

   131     line = string.replace(line, '\\\'E', '&Eacute;')

   132     line = string.replace(line, '\\"E', '&Euml;')

   133     line = string.replace(line, '\\\'I', '&Iacute;')

   134     line = string.replace(line, '\\"I', '&Iuml;')

   135     line = string.replace(line, '\\\'O', '&Oacute;')

   136     line = string.replace(line, '\\"O', '&Ouml;')

   137     line = string.replace(line, '\\\'U', '&Uacute;')

   138     line = string.replace(line, '\\"U', '&Uuml;')

   139     line = string.replace(line, '\\H O', '&Otilde;')

   140     line = string.replace(line, '\\H U', '&Uuml;')   # &Utilde; does not exist

   142     return line

   144 #

   145 # copy characters from a string decoding html expressions (&xyz;)

   146 #

   147 def copychars(str, ifrom, count):

   148     result = ''

   149     i = ifrom

   150     c = 0

   151     html_spec = False

   152     while (i < len(str)) and (c < count):

   153         if str[i] == '&':

   154             html_spec = True;

   155             if i+1 < len(str):

   156                 result += str[i+1]

   157             c += 1

   158             i += 2

   159         else:

   160             if not html_spec:

   161                 if ((str[i] >= 'A') and (str[i] <= 'Z')) or \

   162                    ((str[i] >= 'a') and (str[i] <= 'z')):

   163                     result += str[i]

   164                     c += 1

   165             elif str[i] == ';':

   166                 html_spec = False;

   167             i += 1

   169     return result

   172 #

   173 # Handle a list of authors (separated by 'and').

   174 # It gives back a dictionary with the following values:

   175 #  - num: the number of authors,

   176 #  - list: the list of the author names,

   177 #  - text: the bibtex text (separated by commas and/or 'and')

   178 #  - abbrev: abbreviation that can be used for indicate the

   179 #    bibliography entries

   180 #

   181 def bibtexauthor(data):

   182     result = {}

   183     bibtex = ''

   184     result['list'] = author_rex.split(data)

   185     result['num'] = len(result['list'])

   186     for i, author in enumerate(result['list']):

   187         # general transformations

   188         author = latexreplacements(removebraces(author.strip()))

   189         # transform "Xyz, A. B." to "A. B. Xyz"

   190         pos = author.find(',')

   191         if pos != -1:

   192             author = author[pos+1:].strip() + ' ' + author[:pos].strip()

   193         result['list'][i] = author

   194         bibtex += author + '#'

   195     bibtex = bibtex[:-1]

   196     if result['num'] > 1:

   197         ix = bibtex.rfind('#')

   198         if result['num'] == 2:

   199             bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]

   200         else:

   201             bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]

   202     bibtex = bibtex.replace('#', ', ')

   203     result['text'] = bibtex

   205     result['abbrev'] = ''

   206     for author in result['list']:

   207         pos = author.rfind(' ') + 1

   208         count = 1

   209         if result['num'] == 1:

   210             count = 3

   211         result['abbrev'] += copychars(author, pos, count)

   213     return result

   216 #

   217 # data = title string

   218 # @return the capitalized title (first letter is capitalized), rest are capitalized

   219 # only if capitalized inside braces

   220 #

   221 def capitalizetitle(data):

   222     title_list = capitalize_rex.split(data)

   223     title = ''

   224     count = 0

   225     for phrase in title_list:

   226          check = string.lstrip(phrase)

   228          # keep phrase's capitalization the same

   229          if check.find('{') == 0:

   230               title += removebraces(phrase)

   231          else:

   232          # first word --> capitalize first letter (after spaces)

   233               if count == 0:

   234                   title += check.capitalize()

   235               else:

   236                   title += phrase.lower()

   237          count = count + 1

   239     return title

   242 #

   243 # @return the bibtex for the title

   244 # @param data --> title string

   245 # braces are removed from title

   246 #

   247 def bibtextitle(data, entrytype):

   248     if entrytype in ('book', 'inbook'):

   249         title = removebraces(data.strip())

   250     else:

   251         title = removebraces(capitalizetitle(data.strip()))

   252     bibtex = title

   253     return bibtex

   256 #

   257 # function to compare entry lists

   258 #

   259 def entry_cmp(x, y):

   260     return cmp(x[0], y[0])

   263 #

   264 # return the Doxygen lines for the transformed "filecont_source"

   265 #

   266 def bibtexdecoder(filecont_source):

   267     filecont = []

   268     file = []

   270     # want @<alphanumeric chars><spaces>{<spaces><any chars>,

   271     pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')

   272     endtype_rex = re.compile('}\s*$')

   273     endtag_rex = re.compile('^\s*}\s*$')

   275     bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')

   276     bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')

   278     quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')

   279     quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')

   281     for line in filecont_source:

   282         line = line[:-1]

   284         # encode character entities

   285         line = string.replace(line, '&', '&amp;')

   286         line = string.replace(line, '<', '&lt;')

   287         line = string.replace(line, '>', '&gt;')

   289         # start entry: publication type (store for later use)

   290         if pubtype_rex.match(line):

   291         # want @<alphanumeric chars><spaces>{<spaces><any chars>,

   292             entrycont = {}

   293             entry = []

   294             entrytype = pubtype_rex.sub('\g<1>',line)

   295             entrytype = string.lower(entrytype)

   296             entryid   = pubtype_rex.sub('\g<2>', line)

   298         # end entry if just a }

   299         elif endtype_rex.match(line):

   300             # generate doxygen code for the entry

   302             # enty type related formattings

   303             if entrytype in ('book', 'inbook'):

   304                 entrycont['title'] = '<em>' + entrycont['title'] + '</em>'

   305                 if not entrycont.has_key('author'):

   306                     entrycont['author'] = entrycont['editor']

   307                     entrycont['author']['text'] += ', editors'

   308             elif entrytype == 'article':

   309                 entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'

   310             elif entrytype in ('inproceedings', 'incollection', 'conference'):

   311                 entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'

   312             elif entrytype == 'techreport':

   313                 if not entrycont.has_key('type'):

   314                     entrycont['type'] = 'Technical report'

   315             elif entrytype == 'mastersthesis':

   316                 entrycont['type'] = 'Master\'s thesis'

   317             elif entrytype == 'phdthesis':

   318                 entrycont['type'] = 'PhD thesis'

   320             for eline in entrycont:

   321                 if eline != '':

   322                     eline = latexreplacements(eline)

   324             if entrycont.has_key('pages') and (entrycont['pages'] != ''):

   325                 entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')

   327             if entrycont.has_key('author') and (entrycont['author'] != ''):

   328                 entry.append(entrycont['author']['text'] + '.')

   329             if entrycont.has_key('title') and (entrycont['title'] != ''):

   330                 entry.append(entrycont['title'] + '.')

   331             if entrycont.has_key('journal') and (entrycont['journal'] != ''):

   332                 entry.append(entrycont['journal'] + ',')

   333             if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):

   334                 entry.append('In ' + entrycont['booktitle'] + ',')

   335             if entrycont.has_key('type') and (entrycont['type'] != ''):

   336                 eline = entrycont['type']

   337                 if entrycont.has_key('number') and (entrycont['number'] != ''):

   338                     eline += ' ' + entrycont['number']

   339                 eline += ','

   340                 entry.append(eline)

   341             if entrycont.has_key('institution') and (entrycont['institution'] != ''):

   342                 entry.append(entrycont['institution'] + ',')

   343             if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):

   344                 entry.append(entrycont['publisher'] + ',')

   345             if entrycont.has_key('school') and (entrycont['school'] != ''):

   346                 entry.append(entrycont['school'] + ',')

   347             if entrycont.has_key('address') and (entrycont['address'] != ''):

   348                 entry.append(entrycont['address'] + ',')

   349             if entrycont.has_key('edition') and (entrycont['edition'] != ''):

   350                 entry.append(entrycont['edition'] + ' edition,')

   351             if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):

   352                 entry.append(entrycont['howpublished'] + ',')

   353             if entrycont.has_key('volume') and (entrycont['volume'] != ''):

   354                 eline = entrycont['volume'];

   355                 if entrycont.has_key('number') and (entrycont['number'] != ''):

   356                     eline += '(' + entrycont['number'] + ')'

   357                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):

   358                     eline += ':' + entrycont['pages']

   359                 eline += ','

   360                 entry.append(eline)

   361             else:

   362                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):

   363                     entry.append('pages ' + entrycont['pages'] + ',')

   364             if entrycont.has_key('year') and (entrycont['year'] != ''):

   365                 if entrycont.has_key('month') and (entrycont['month'] != ''):

   366                     entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')

   367                 else:

   368                     entry.append(entrycont['year'] + '.')

   369             if entrycont.has_key('note') and (entrycont['note'] != ''):

   370                 entry.append(entrycont['note'] + '.')

   371             if entrycont.has_key('url') and (entrycont['url'] != ''):

   372                 entry.append(entrycont['url'] + '.')

   374             # generate keys for sorting and for the output

   375             sortkey = ''

   376             bibkey = ''

   377             if entrycont.has_key('author'):

   378                 for author in entrycont['author']['list']:

   379                     sortkey += copychars(author, author.rfind(' ')+1, len(author))

   380                 bibkey = entrycont['author']['abbrev']

   381             else:

   382                 bibkey = 'x'

   383             if entrycont.has_key('year'):

   384                 sortkey += entrycont['year']

   385                 bibkey += entrycont['year'][-2:]

   386             if entrycont.has_key('title'):

   387                 sortkey += entrycont['title']

   388             if entrycont.has_key('key'):

   389                 sortkey = entrycont['key'] + sortkey

   390                 bibkey = entrycont['key']

   391             entry.insert(0, sortkey)

   392             entry.insert(1, bibkey)

   393             entry.insert(2, entryid)

   395             # add the entry to the file contents

   396             filecont.append(entry)

   398         else:

   399             # field, publication info

   400             field = ''

   401             data = ''

   403             # field = {data} entries

   404             if bracedata_rex.match(line):

   405                 field = bracefield_rex.sub('\g<1>', line)

   406                 field = string.lower(field)

   407                 data =  bracedata_rex.sub('\g<2>', line)

   409             # field = "data" entries

   410             elif quotedata_rex.match(line):

   411                 field = quotefield_rex.sub('\g<1>', line)

   412                 field = string.lower(field)

   413                 data =  quotedata_rex.sub('\g<2>', line)

   415             # field = data entries

   416             elif data_rex.match(line):

   417                 field = field_rex.sub('\g<1>', line)

   418                 field = string.lower(field)

   419                 data =  data_rex.sub('\g<2>', line)

   421             if field == 'url':

   422                 data = '\\url{' + data.strip() + '}'

   424             if field in ('author', 'editor'):

   425                 entrycont[field] = bibtexauthor(data)

   426                 line = ''

   427             elif field == 'title':

   428                 line = bibtextitle(data, entrytype)

   429             elif field != '':

   430                 line = removebraces(transformurls(data.strip()))

   432             if line != '':

   433                 line = latexreplacements(line)

   434                 entrycont[field] = line

   437     # sort entries

   438     filecont.sort(entry_cmp)

   440     # count the bibtex keys

   441     keytable = {}

   442     counttable = {}

   443     for entry in filecont:

   444         bibkey = entry[1]

   445         if not keytable.has_key(bibkey):

   446             keytable[bibkey] = 1

   447         else:

   448             keytable[bibkey] += 1

   450     for bibkey in keytable.keys():

   451         counttable[bibkey] = 0

   453     # generate output

   454     for entry in filecont:

   455         # generate output key form the bibtex key

   456         bibkey = entry[1]

   457         entryid = entry[2]

   458         if keytable[bibkey] == 1:

   459             outkey = bibkey

   460         else:

   461             outkey = bibkey + chr(97 + counttable[bibkey])

   462         counttable[bibkey] += 1

   464         # append the entry code to the output

   465         file.append('\\section ' + entryid + ' [' + outkey + ']')

   466         file.append('<div style="' + divstyle + '">')

   467         for line in entry[3:]:

   468             file.append(line)

   469         file.append('</div>')

   470         file.append('')

   472     return file

   475 #

   476 # return 1 iff abbr is in line but not inside braces or quotes

   477 # assumes that abbr appears only once on the line (out of braces and quotes)

   478 #

   479 def verify_out_of_braces(line, abbr):

   481     phrase_split = delimiter_rex.split(line)

   483     abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)

   485     open_brace = 0

   486     open_quote = 0

   488     for phrase in phrase_split:

   489         if phrase == "{":

   490             open_brace = open_brace + 1

   491         elif phrase == "}":

   492             open_brace = open_brace - 1

   493         elif phrase == '"':

   494             if open_quote == 1:

   495                 open_quote = 0

   496             else:

   497                 open_quote = 1

   498         elif abbr_rex.search(phrase):

   499             if open_brace == 0 and open_quote == 0:

   500                 return 1

   502     return 0

   505 #

   506 # a line in the form phrase1 # phrase2 # ... # phrasen

   507 # is returned as phrase1 phrase2 ... phrasen

   508 # with the correct punctuation

   509 # Bug: Doesn't always work with multiple abbreviations plugged in

   510 #

   511 def concat_line(line):

   512     # only look at part after equals

   513     field = field_rex.sub('\g<1>',line)

   514     rest = field_rex.sub('\g<2>',line)

   516     concat_line = field + ' ='

   518     pound_split = concatsplit_rex.split(rest)

   520     phrase_count = 0

   521     length = len(pound_split)

   523     for phrase in pound_split:

   524         phrase = phrase.strip()

   525         if phrase_count != 0:

   526             if phrase.startswith('"') or phrase.startswith('{'):

   527                 phrase = phrase[1:]

   528         elif phrase.startswith('"'):

   529             phrase = phrase.replace('"','{',1)

   531         if phrase_count != length-1:

   532             if phrase.endswith('"') or phrase.endswith('}'):

   533                 phrase = phrase[:-1]

   534         else:

   535             if phrase.endswith('"'):

   536                 phrase = phrase[:-1]

   537                 phrase = phrase + "}"

   538             elif phrase.endswith('",'):

   539                 phrase = phrase[:-2]

   540                 phrase = phrase + "},"

   542         # if phrase did have \#, add the \# back

   543         if phrase.endswith('\\'):

   544             phrase = phrase + "#"

   545         concat_line = concat_line + ' ' + phrase

   547         phrase_count = phrase_count + 1

   549     return concat_line

   552 #

   553 # substitute abbreviations into filecont

   554 # @param filecont_source - string of data from file

   555 #

   556 def bibtex_replace_abbreviations(filecont_source):

   557     filecont = filecont_source.splitlines()

   559     #  These are defined in bibtex, so we'll define them too

   560     abbr_list = ['jan','feb','mar','apr','may','jun',

   561                  'jul','aug','sep','oct','nov','dec']

   562     value_list = ['January','February','March','April',

   563                   'May','June','July','August','September',

   564                   'October','November','December']

   566     abbr_rex = []

   567     total_abbr_count = 0

   569     front = '\\b'

   570     back = '(,?)\\b'

   572     for x in abbr_list:

   573         abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )

   574         total_abbr_count = total_abbr_count + 1

   577     abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',

   578                              re.I)

   580     comment_rex = re.compile('@comment\s*{',re.I)

   581     preamble_rex = re.compile('@preamble\s*{',re.I)

   583     waiting_for_end_string = 0

   584     i = 0

   585     filecont2 = ''

   587     for line in filecont:

   588         if line == ' ' or line == '':

   589             continue

   591         if waiting_for_end_string:

   592             if re.search('}',line):

   593                 waiting_for_end_string = 0

   594                 continue

   596         if abbrdef_rex.search(line):

   597             abbr = abbrdef_rex.sub('\g<1>', line)

   599             if abbr_list.count(abbr) == 0:

   600                 val = abbrdef_rex.sub('\g<2>', line)

   601                 abbr_list.append(abbr)

   602                 value_list.append(string.strip(val))

   603                 abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )

   604                 total_abbr_count = total_abbr_count + 1

   605             waiting_for_end_string = 1

   606             continue

   608         if comment_rex.search(line):

   609             waiting_for_end_string = 1

   610             continue

   612         if preamble_rex.search(line):

   613             waiting_for_end_string = 1

   614             continue

   617         # replace subsequent abbreviations with the value

   618         abbr_count = 0

   620         for x in abbr_list:

   622             if abbr_rex[abbr_count].search(line):

   623                 if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:

   624                     line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)

   625                 # Check for # concatenations

   626                 if concatsplit_rex.search(line):

   627                     line = concat_line(line)

   628             abbr_count = abbr_count + 1

   631         filecont2 = filecont2 + line + '\n'

   632         i = i+1

   635     # Do one final pass over file

   637     # make sure that didn't end up with {" or }" after the substitution

   638     filecont2 = filecont2.replace('{"','{{')

   639     filecont2 = filecont2.replace('"}','}}')

   641     afterquotevalue_rex = re.compile('"\s*,\s*')

   642     afterbrace_rex = re.compile('"\s*}')

   643     afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')

   645     # add new lines to data that changed because of abbreviation substitutions

   646     filecont2 = afterquotevalue_rex.sub('",\n', filecont2)

   647     filecont2 = afterbrace_rex.sub('"\n}', filecont2)

   648     filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)

   650     return filecont2

   652 #

   653 # convert @type( ... ) to @type{ ... }

   654 #

   655 def no_outer_parens(filecont):

   657     # do checking for open parens

   658     # will convert to braces

   659     paren_split = re.split('([(){}])',filecont)

   661     open_paren_count = 0

   662     open_type = 0

   663     look_next = 0

   665     # rebuild filecont

   666     filecont = ''

   668     at_rex = re.compile('@\w*')

   670     for phrase in paren_split:

   671         if look_next == 1:

   672             if phrase == '(':

   673                 phrase = '{'

   674                 open_paren_count = open_paren_count + 1

   675             else:

   676                 open_type = 0

   677             look_next = 0

   679         if phrase == '(':

   680             open_paren_count = open_paren_count + 1

   682         elif phrase == ')':

   683             open_paren_count = open_paren_count - 1

   684             if open_type == 1 and open_paren_count == 0:

   685                 phrase = '}'

   686                 open_type = 0

   688         elif at_rex.search( phrase ):

   689             open_type = 1

   690             look_next = 1

   692         filecont = filecont + phrase

   694     return filecont

   697 #

   698 # make all whitespace into just one space

   699 # format the bibtex file into a usable form.

   700 #

   701 def bibtexwasher(filecont_source):

   703     space_rex = re.compile('\s+')

   704     comment_rex = re.compile('\s*%')

   706     filecont = []

   708     # remove trailing and excessive whitespace

   709     # ignore comments

   710     for line in filecont_source:

   711         line = string.strip(line)

   712         line = space_rex.sub(' ', line)

   713         # ignore comments

   714         if not comment_rex.match(line) and line != '':

   715             filecont.append(' '+ line)

   717     filecont = string.join(filecont, '')

   719     # the file is in one long string

   721     filecont = no_outer_parens(filecont)

   723     #

   724     # split lines according to preferred syntax scheme

   725     #

   726     filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)

   728     # add new lines after commas that are after values

   729     filecont = re.sub('"\s*,', '",\n', filecont)

   730     filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)

   731     filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',

   732                           '\n\n\g<1>\g<2>,\n', filecont)

   734     # add new lines after }

   735     filecont = re.sub('"\s*}','"\n}\n', filecont)

   736     filecont = re.sub('}\s*,','},\n', filecont)

   739     filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)

   741     # character encoding, reserved latex characters

   742     filecont = re.sub('{\\\&}', '&', filecont)

   743     filecont = re.sub('\\\&', '&', filecont)

   745     # do checking for open braces to get format correct

   746     open_brace_count = 0

   747     brace_split = re.split('([{}])',filecont)

   749     # rebuild filecont

   750     filecont = ''

   752     for phrase in brace_split:

   753         if phrase == '{':

   754             open_brace_count = open_brace_count + 1

   755         elif phrase == '}':

   756             open_brace_count = open_brace_count - 1

   757             if open_brace_count == 0:

   758                 filecont = filecont + '\n'

   760         filecont = filecont + phrase

   762     filecont2 = bibtex_replace_abbreviations(filecont)

   764     # gather

   765     filecont = filecont2.splitlines()

   766     i=0

   767     j=0         # count the number of blank lines

   768     for line in filecont:

   769         # ignore blank lines

   770         if line == '' or line == ' ':

   771             j = j+1

   772             continue

   773         filecont[i] = line + '\n'

   774         i = i+1

   776     # get rid of the extra stuff at the end of the array

   777     # (The extra stuff are duplicates that are in the array because

   778     # blank lines were removed.)

   779     length = len( filecont)

   780     filecont[length-j:length] = []

   782     return filecont

   785 def filehandler(filepath):

   786     try:

   787         fd = open(filepath, 'r')

   788         filecont_source = fd.readlines()

   789         fd.close()

   790     except:

   791         print 'Could not open file:', filepath

   792     washeddata = bibtexwasher(filecont_source)

   793     outdata = bibtexdecoder(washeddata)

   794     print '/**'

   795     print '\page references References'

   796     print

   797     for line in outdata:

   798         print line

   799     print '*/'

   802 # main program

   804 def main():

   805     import sys

   806     if sys.argv[1:]:

   807         filepath = sys.argv[1]

   808     else:

   809         print "No input file"

   810         sys.exit()

   811     filehandler(filepath)

   813 if __name__ == "__main__": main()

   816 # end python script

author	Alpar Juttner <alpar@cs.elte.hu>
	Mon, 10 Jan 2011 09:34:50 +0100
changeset 922	9312d6c89d02
parent 754	2de0fc630899
child 1052	eb2f9d453070
permissions	-rwxr-xr-x