lemon-1.3: scripts/bib2dox.py@f3bc4e9b5f3a

     1 #!/usr/bin/env /usr/local/Python/bin/python2.1

     2 """

     3   BibTeX to Doxygen converter

     4   Usage: python bib2dox.py bibfile.bib > bibfile.dox

     6   This code is the modification of the BibTeX to XML converter

     7   by Vidar Bronken Gundersen et al. See the original copyright notices below.

     9   **********************************************************************

    11   Decoder for bibliographic data, BibTeX

    12   Usage: python bibtex2xml.py bibfile.bib > bibfile.xml

    14   v.8

    15   (c)2002-06-23 Vidar Bronken Gundersen

    16   http://bibtexml.sf.net/

    17   Reuse approved as long as this notification is kept.

    18   Licence: GPL.

    20   Contributions/thanks to:

    21   Egon Willighagen, http://sf.net/projects/jreferences/

    22   Richard Mahoney (for providing a test case)

    24   Edited by Sara Sprenkle to be more robust and handle more bibtex features.

    25   (c) 2003-01-15

    27   1.  Changed bibtex: tags to bibxml: tags.

    28   2.  Use xmlns:bibxml="http://bibtexml.sf.net/"

    29   3.  Allow spaces between @type and first {

    30   4.  "author" fields with multiple authors split by " and "

    31       are put in separate xml "bibxml:author" tags.

    32   5.  Option for Titles: words are capitalized

    33       only if first letter in title or capitalized inside braces

    34   6.  Removes braces from within field values

    35   7.  Ignores comments in bibtex file (including @comment{ or % )

    36   8.  Replaces some special latex tags, e.g., replaces ~ with '&#160;'

    37   9.  Handles bibtex @string abbreviations

    38         --> includes bibtex's default abbreviations for months

    39         --> does concatenation of abbr # " more " and " more " # abbr

    40   10. Handles @type( ... ) or @type{ ... }

    41   11. The keywords field is split on , or ; and put into separate xml

    42       "bibxml:keywords" tags

    43   12. Ignores @preamble

    45   Known Limitations

    46   1.  Does not transform Latex encoding like math mode and special

    47       latex symbols.

    48   2.  Does not parse author fields into first and last names.

    49       E.g., It does not do anything special to an author whose name is

    50       in the form LAST_NAME, FIRST_NAME

    51       In "author" tag, will show up as

    52       <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>

    53   3.  Does not handle "crossref" fields other than to print

    54       <bibxml:crossref>...</bibxml:crossref>

    55   4.  Does not inform user of the input's format errors.  You just won't

    56       be able to transform the file later with XSL

    58   You will have to manually edit the XML output if you need to handle

    59   these (and unknown) limitations.

    61 """

    63 import string, re

    65 # set of valid name characters

    66 valid_name_chars = '[\w\-:]'

    68 #

    69 # define global regular expression variables

    70 #

    71 author_rex = re.compile('\s+and\s+')

    72 rembraces_rex = re.compile('[{}]')

    73 capitalize_rex = re.compile('({[^}]*})')

    75 # used by bibtexkeywords(data)

    76 keywords_rex = re.compile('[,;]')

    78 # used by concat_line(line)

    79 concatsplit_rex = re.compile('\s*#\s*')

    81 # split on {, }, or " in verify_out_of_braces

    82 delimiter_rex = re.compile('([{}"])',re.I)

    84 field_rex = re.compile('\s*(\w*)\s*=\s*(.*)')

    85 data_rex = re.compile('\s*(\w*)\s*=\s*([^,]*),?')

    87 url_rex = re.compile('\\\url\{([^}]*)\}')

    89 #

    90 # styles for html formatting

    91 #

    92 divstyle = 'margin-top: -4ex; margin-left: 8em;'

    94 #

    95 # return the string parameter without braces

    96 #

    97 def transformurls(str):

    98     return url_rex.sub(r'<a href="\1">\1</a>', str)

   100 #

   101 # return the string parameter without braces

   102 #

   103 def removebraces(str):

   104     return rembraces_rex.sub('', str)

   106 #

   107 # latex-specific replacements

   108 # (do this after braces were removed)

   109 #

   110 def latexreplacements(line):

   111     line = string.replace(line, '~', '&nbsp;')

   112     line = string.replace(line, '\\\'a', '&aacute;')

   113     line = string.replace(line, '\\"a', '&auml;')

   114     line = string.replace(line, '\\\'e', '&eacute;')

   115     line = string.replace(line, '\\"e', '&euml;')

   116     line = string.replace(line, '\\\'i', '&iacute;')

   117     line = string.replace(line, '\\"i', '&iuml;')

   118     line = string.replace(line, '\\\'o', '&oacute;')

   119     line = string.replace(line, '\\"o', '&ouml;')

   120     line = string.replace(line, '\\\'u', '&uacute;')

   121     line = string.replace(line, '\\"u', '&uuml;')

   122     line = string.replace(line, '\\H o', '&otilde;')

   123     line = string.replace(line, '\\H u', '&uuml;')   # &utilde; does not exist

   124     line = string.replace(line, '\\\'A', '&Aacute;')

   125     line = string.replace(line, '\\"A', '&Auml;')

   126     line = string.replace(line, '\\\'E', '&Eacute;')

   127     line = string.replace(line, '\\"E', '&Euml;')

   128     line = string.replace(line, '\\\'I', '&Iacute;')

   129     line = string.replace(line, '\\"I', '&Iuml;')

   130     line = string.replace(line, '\\\'O', '&Oacute;')

   131     line = string.replace(line, '\\"O', '&Ouml;')

   132     line = string.replace(line, '\\\'U', '&Uacute;')

   133     line = string.replace(line, '\\"U', '&Uuml;')

   134     line = string.replace(line, '\\H O', '&Otilde;')

   135     line = string.replace(line, '\\H U', '&Uuml;')   # &Utilde; does not exist

   137     return line

   139 #

   140 # copy characters form a string decoding html expressions (&xyz;)

   141 #

   142 def copychars(str, ifrom, count):

   143     result = ''

   144     i = ifrom

   145     c = 0

   146     html_spec = False

   147     while (i < len(str)) and (c < count):

   148         if str[i] == '&':

   149             html_spec = True;

   150             if i+1 < len(str):

   151                 result += str[i+1]

   152             c += 1

   153             i += 2

   154         else:

   155             if not html_spec:

   156                 if ((str[i] >= 'A') and (str[i] <= 'Z')) or \

   157                    ((str[i] >= 'a') and (str[i] <= 'z')):

   158                     result += str[i]

   159                     c += 1

   160             elif str[i] == ';':

   161                 html_spec = False;

   162             i += 1

   164     return result

   167 #

   168 # Handle a list of authors (separated by 'and').

   169 # It gives back an array of the follwing values:

   170 #  - num: the number of authors,

   171 #  - list: the list of the author names,

   172 #  - text: the bibtex text (separated by commas and/or 'and')

   173 #  - abbrev: abbreviation that can be used for indicate the

   174 #    bibliography entries

   175 #

   176 def bibtexauthor(data):

   177     result = {}

   178     bibtex = ''

   179     result['list'] = author_rex.split(data)

   180     result['num'] = len(result['list'])

   181     for i, author in enumerate(result['list']):

   182         # general transformations

   183         author = latexreplacements(removebraces(author.strip()))

   184         # transform "Xyz, A. B." to "A. B. Xyz"

   185         pos = author.find(',')

   186         if pos != -1:

   187             author = author[pos+1:].strip() + ' ' + author[:pos].strip()

   188         result['list'][i] = author

   189         bibtex += author + '#'

   190     bibtex = bibtex[:-1]

   191     if result['num'] > 1:

   192         ix = bibtex.rfind('#')

   193         if result['num'] == 2:

   194             bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]

   195         else:

   196             bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]

   197     bibtex = bibtex.replace('#', ', ')

   198     result['text'] = bibtex

   200     result['abbrev'] = ''

   201     for author in result['list']:

   202         pos = author.rfind(' ') + 1

   203         count = 1

   204         if result['num'] == 1:

   205             count = 3

   206         result['abbrev'] += copychars(author, pos, count)

   208     return result

   211 #

   212 # data = title string

   213 # @return the capitalized title (first letter is capitalized), rest are capitalized

   214 # only if capitalized inside braces

   215 #

   216 def capitalizetitle(data):

   217     title_list = capitalize_rex.split(data)

   218     title = ''

   219     count = 0

   220     for phrase in title_list:

   221          check = string.lstrip(phrase)

   223          # keep phrase's capitalization the same

   224          if check.find('{') == 0:

   225               title += removebraces(phrase)

   226          else:

   227          # first word --> capitalize first letter (after spaces)

   228               if count == 0:

   229                   title += check.capitalize()

   230               else:

   231                   title += phrase.lower()

   232          count = count + 1

   234     return title

   237 #

   238 # @return the bibtex for the title

   239 # @param data --> title string

   240 # braces are removed from title

   241 #

   242 def bibtextitle(data, entrytype):

   243     if entrytype in ('book', 'inbook'):

   244         title = removebraces(data.strip())

   245     else:

   246         title = removebraces(capitalizetitle(data.strip()))

   247     bibtex = title

   248     return bibtex

   251 #

   252 # function to compare entry lists

   253 #

   254 def entry_cmp(x, y):

   255     return cmp(x[0], y[0])

   258 #

   259 # print the XML for the transformed "filecont_source"

   260 #

   261 def bibtexdecoder(filecont_source):

   262     filecont = []

   263     file = []

   265     # want @<alphanumeric chars><spaces>{<spaces><any chars>,

   266     pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')

   267     endtype_rex = re.compile('}\s*$')

   268     endtag_rex = re.compile('^\s*}\s*$')

   270     bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')

   271     bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')

   273     quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')

   274     quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')

   276     for line in filecont_source:

   277         line = line[:-1]

   279         # encode character entities

   280         line = string.replace(line, '&', '&amp;')

   281         line = string.replace(line, '<', '&lt;')

   282         line = string.replace(line, '>', '&gt;')

   284         # start entry: publication type (store for later use)

   285         if pubtype_rex.match(line):

   286         # want @<alphanumeric chars><spaces>{<spaces><any chars>,

   287             entrycont = {}

   288             entry = []

   289             entrytype = pubtype_rex.sub('\g<1>',line)

   290             entrytype = string.lower(entrytype)

   291             entryid   = pubtype_rex.sub('\g<2>', line)

   293         # end entry if just a }

   294         elif endtype_rex.match(line):

   295             # generate doxygen code for the entry

   297             # enty type related formattings

   298             if entrytype in ('book', 'inbook'):

   299                 entrycont['title'] = '<em>' + entrycont['title'] + '</em>'

   300                 if not entrycont.has_key('author'):

   301                     entrycont['author'] = entrycont['editor']

   302                     entrycont['author']['text'] += ', editors'

   303             elif entrytype == 'article':

   304                 entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'

   305             elif entrytype in ('inproceedings', 'incollection', 'conference'):

   306                 entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'

   307             elif entrytype == 'techreport':

   308                 if not entrycont.has_key('type'):

   309                     entrycont['type'] = 'Technical report'

   310             elif entrytype == 'mastersthesis':

   311                 entrycont['type'] = 'Master\'s thesis'

   312             elif entrytype == 'phdthesis':

   313                 entrycont['type'] = 'PhD thesis'

   315             for eline in entrycont:

   316                 if eline != '':

   317                     eline = latexreplacements(eline)

   319             if entrycont.has_key('pages') and (entrycont['pages'] != ''):

   320                 entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')

   322             if entrycont.has_key('author') and (entrycont['author'] != ''):

   323                 entry.append(entrycont['author']['text'] + '.')

   324             if entrycont.has_key('title') and (entrycont['title'] != ''):

   325                 entry.append(entrycont['title'] + '.')

   326             if entrycont.has_key('journal') and (entrycont['journal'] != ''):

   327                 entry.append(entrycont['journal'] + ',')

   328             if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):

   329                 entry.append('In ' + entrycont['booktitle'] + ',')

   330             if entrycont.has_key('type') and (entrycont['type'] != ''):

   331                 eline = entrycont['type']

   332                 if entrycont.has_key('number') and (entrycont['number'] != ''):

   333                     eline += ' ' + entrycont['number']

   334                 eline += ','

   335                 entry.append(eline)

   336             if entrycont.has_key('institution') and (entrycont['institution'] != ''):

   337                 entry.append(entrycont['institution'] + ',')

   338             if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):

   339                 entry.append(entrycont['publisher'] + ',')

   340             if entrycont.has_key('school') and (entrycont['school'] != ''):

   341                 entry.append(entrycont['school'] + ',')

   342             if entrycont.has_key('address') and (entrycont['address'] != ''):

   343                 entry.append(entrycont['address'] + ',')

   344             if entrycont.has_key('edition') and (entrycont['edition'] != ''):

   345                 entry.append(entrycont['edition'] + ' edition,')

   346             if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):

   347                 entry.append(entrycont['howpublished'] + ',')

   348             if entrycont.has_key('volume') and (entrycont['volume'] != ''):

   349                 eline = entrycont['volume'];

   350                 if entrycont.has_key('number') and (entrycont['number'] != ''):

   351                     eline += '(' + entrycont['number'] + ')'

   352                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):

   353                     eline += ':' + entrycont['pages']

   354                 eline += ','

   355                 entry.append(eline)

   356             else:

   357                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):

   358                     entry.append('pages ' + entrycont['pages'] + ',')

   359             if entrycont.has_key('year') and (entrycont['year'] != ''):

   360                 if entrycont.has_key('month') and (entrycont['month'] != ''):

   361                     entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')

   362                 else:

   363                     entry.append(entrycont['year'] + '.')

   364             if entrycont.has_key('note') and (entrycont['note'] != ''):

   365                 entry.append(entrycont['note'] + '.')

   366             if entrycont.has_key('url') and (entrycont['url'] != ''):

   367                 entry.append(entrycont['url'] + '.')

   369             # generate keys for sorting and for the output

   370             sortkey = ''

   371             bibkey = ''

   372             if entrycont.has_key('author'):

   373                 for author in entrycont['author']['list']:

   374                     sortkey += copychars(author, author.rfind(' ')+1, len(author))

   375                 bibkey = entrycont['author']['abbrev']

   376             else:

   377                 bibkey = 'x'

   378             if entrycont.has_key('year'):

   379                 sortkey += entrycont['year']

   380                 bibkey += entrycont['year'][-2:]

   381             if entrycont.has_key('title'):

   382                 sortkey += entrycont['title']

   383             if entrycont.has_key('key'):

   384                 sortkey = entrycont['key'] + sortkey

   385                 bibkey = entrycont['key']

   386             entry.insert(0, sortkey)

   387             entry.insert(1, bibkey)

   388             entry.insert(2, entryid)

   390             # add the entry to the file contents

   391             filecont.append(entry)

   393         else:

   394             # field, publication info

   395             field = ''

   396             data = ''

   398             # field = {data} entries

   399             if bracedata_rex.match(line):

   400                 field = bracefield_rex.sub('\g<1>', line)

   401                 field = string.lower(field)

   402                 data =  bracedata_rex.sub('\g<2>', line)

   404             # field = "data" entries

   405             elif quotedata_rex.match(line):

   406                 field = quotefield_rex.sub('\g<1>', line)

   407                 field = string.lower(field)

   408                 data =  quotedata_rex.sub('\g<2>', line)

   410             # field = data entries

   411             elif data_rex.match(line):

   412                 field = field_rex.sub('\g<1>', line)

   413                 field = string.lower(field)

   414                 data =  data_rex.sub('\g<2>', line)

   416             if field == 'url':

   417                 data = '\\url{' + data.strip() + '}'

   419             if field in ('author', 'editor'):

   420                 entrycont[field] = bibtexauthor(data)

   421                 line = ''

   422             elif field == 'title':

   423                 line = bibtextitle(data, entrytype)

   424             elif field != '':

   425                 line = removebraces(transformurls(data.strip()))

   427             if line != '':

   428                 line = latexreplacements(line)

   429                 entrycont[field] = line

   432     # sort entries

   433     filecont.sort(entry_cmp)

   435     # count the bibtex keys

   436     keytable = {}

   437     counttable = {}

   438     for entry in filecont:

   439         bibkey = entry[1]

   440         if not keytable.has_key(bibkey):

   441             keytable[bibkey] = 1

   442         else:

   443             keytable[bibkey] += 1

   445     for bibkey in keytable.keys():

   446         counttable[bibkey] = 0

   448     # generate output

   449     for entry in filecont:

   450         # generate output key form the bibtex key

   451         bibkey = entry[1]

   452         entryid = entry[2]

   453         if keytable[bibkey] == 1:

   454             outkey = bibkey

   455         else:

   456             outkey = bibkey + chr(97 + counttable[bibkey])

   457         counttable[bibkey] += 1

   459         # append the entry code to the output

   460         file.append('\\section ' + entryid + ' [' + outkey + ']')

   461         file.append('<div style="' + divstyle + '">')

   462         for line in entry[3:]:

   463             file.append(line)

   464         file.append('</div>')

   465         file.append('')

   467     return file

   470 #

   471 # return 1 iff abbr is in line but not inside braces or quotes

   472 # assumes that abbr appears only once on the line (out of braces and quotes)

   473 #

   474 def verify_out_of_braces(line, abbr):

   476     phrase_split = delimiter_rex.split(line)

   478     abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)

   480     open_brace = 0

   481     open_quote = 0

   483     for phrase in phrase_split:

   484         if phrase == "{":

   485             open_brace = open_brace + 1

   486         elif phrase == "}":

   487             open_brace = open_brace - 1

   488         elif phrase == '"':

   489             if open_quote == 1:

   490                 open_quote = 0

   491             else:

   492                 open_quote = 1

   493         elif abbr_rex.search(phrase):

   494             if open_brace == 0 and open_quote == 0:

   495                 return 1

   497     return 0

   500 #

   501 # a line in the form phrase1 # phrase2 # ... # phrasen

   502 # is returned as phrase1 phrase2 ... phrasen

   503 # with the correct punctuation

   504 # Bug: Doesn't always work with multiple abbreviations plugged in

   505 #

   506 def concat_line(line):

   507     # only look at part after equals

   508     field = field_rex.sub('\g<1>',line)

   509     rest = field_rex.sub('\g<2>',line)

   511     concat_line = field + ' ='

   513     pound_split = concatsplit_rex.split(rest)

   515     phrase_count = 0

   516     length = len(pound_split)

   518     for phrase in pound_split:

   519         phrase = phrase.strip()

   520         if phrase_count != 0:

   521             if phrase.startswith('"') or phrase.startswith('{'):

   522                 phrase = phrase[1:]

   523         elif phrase.startswith('"'):

   524             phrase = phrase.replace('"','{',1)

   526         if phrase_count != length-1:

   527             if phrase.endswith('"') or phrase.endswith('}'):

   528                 phrase = phrase[:-1]

   529         else:

   530             if phrase.endswith('"'):

   531                 phrase = phrase[:-1]

   532                 phrase = phrase + "}"

   533             elif phrase.endswith('",'):

   534                 phrase = phrase[:-2]

   535                 phrase = phrase + "},"

   537         # if phrase did have \#, add the \# back

   538         if phrase.endswith('\\'):

   539             phrase = phrase + "#"

   540         concat_line = concat_line + ' ' + phrase

   542         phrase_count = phrase_count + 1

   544     return concat_line

   547 #

   548 # substitute abbreviations into filecont

   549 # @param filecont_source - string of data from file

   550 #

   551 def bibtex_replace_abbreviations(filecont_source):

   552     filecont = filecont_source.splitlines()

   554     #  These are defined in bibtex, so we'll define them too

   555     abbr_list = ['jan','feb','mar','apr','may','jun',

   556                  'jul','aug','sep','oct','nov','dec']

   557     value_list = ['January','February','March','April',

   558                   'May','June','July','August','September',

   559                   'October','November','December']

   561     abbr_rex = []

   562     total_abbr_count = 0

   564     front = '\\b'

   565     back = '(,?)\\b'

   567     for x in abbr_list:

   568         abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )

   569         total_abbr_count = total_abbr_count + 1

   572     abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',

   573                              re.I)

   575     comment_rex = re.compile('@comment\s*{',re.I)

   576     preamble_rex = re.compile('@preamble\s*{',re.I)

   578     waiting_for_end_string = 0

   579     i = 0

   580     filecont2 = ''

   582     for line in filecont:

   583         if line == ' ' or line == '':

   584             continue

   586         if waiting_for_end_string:

   587             if re.search('}',line):

   588                 waiting_for_end_string = 0

   589                 continue

   591         if abbrdef_rex.search(line):

   592             abbr = abbrdef_rex.sub('\g<1>', line)

   594             if abbr_list.count(abbr) == 0:

   595                 val = abbrdef_rex.sub('\g<2>', line)

   596                 abbr_list.append(abbr)

   597                 value_list.append(string.strip(val))

   598                 abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )

   599                 total_abbr_count = total_abbr_count + 1

   600             waiting_for_end_string = 1

   601             continue

   603         if comment_rex.search(line):

   604             waiting_for_end_string = 1

   605             continue

   607         if preamble_rex.search(line):

   608             waiting_for_end_string = 1

   609             continue

   612         # replace subsequent abbreviations with the value

   613         abbr_count = 0

   615         for x in abbr_list:

   617             if abbr_rex[abbr_count].search(line):

   618                 if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:

   619                     line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)

   620                 # Check for # concatenations

   621                 if concatsplit_rex.search(line):

   622                     line = concat_line(line)

   623             abbr_count = abbr_count + 1

   626         filecont2 = filecont2 + line + '\n'

   627         i = i+1

   630     # Do one final pass over file

   632     # make sure that didn't end up with {" or }" after the substitution

   633     filecont2 = filecont2.replace('{"','{{')

   634     filecont2 = filecont2.replace('"}','}}')

   636     afterquotevalue_rex = re.compile('"\s*,\s*')

   637     afterbrace_rex = re.compile('"\s*}')

   638     afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')

   640     # add new lines to data that changed because of abbreviation substitutions

   641     filecont2 = afterquotevalue_rex.sub('",\n', filecont2)

   642     filecont2 = afterbrace_rex.sub('"\n}', filecont2)

   643     filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)

   645     return filecont2

   647 #

   648 # convert @type( ... ) to @type{ ... }

   649 #

   650 def no_outer_parens(filecont):

   652     # do checking for open parens

   653     # will convert to braces

   654     paren_split = re.split('([(){}])',filecont)

   656     open_paren_count = 0

   657     open_type = 0

   658     look_next = 0

   660     # rebuild filecont

   661     filecont = ''

   663     at_rex = re.compile('@\w*')

   665     for phrase in paren_split:

   666         if look_next == 1:

   667             if phrase == '(':

   668                 phrase = '{'

   669                 open_paren_count = open_paren_count + 1

   670             else:

   671                 open_type = 0

   672             look_next = 0

   674         if phrase == '(':

   675             open_paren_count = open_paren_count + 1

   677         elif phrase == ')':

   678             open_paren_count = open_paren_count - 1

   679             if open_type == 1 and open_paren_count == 0:

   680                 phrase = '}'

   681                 open_type = 0

   683         elif at_rex.search( phrase ):

   684             open_type = 1

   685             look_next = 1

   687         filecont = filecont + phrase

   689     return filecont

   692 #

   693 # make all whitespace into just one space

   694 # format the bibtex file into a usable form.

   695 #

   696 def bibtexwasher(filecont_source):

   698     space_rex = re.compile('\s+')

   699     comment_rex = re.compile('\s*%')

   701     filecont = []

   703     # remove trailing and excessive whitespace

   704     # ignore comments

   705     for line in filecont_source:

   706         line = string.strip(line)

   707         line = space_rex.sub(' ', line)

   708         # ignore comments

   709         if not comment_rex.match(line) and line != '':

   710             filecont.append(' '+ line)

   712     filecont = string.join(filecont, '')

   714     # the file is in one long string

   716     filecont = no_outer_parens(filecont)

   718     #

   719     # split lines according to preferred syntax scheme

   720     #

   721     filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)

   723     # add new lines after commas that are after values

   724     filecont = re.sub('"\s*,', '",\n', filecont)

   725     filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)

   726     filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',

   727                           '\n\n\g<1>\g<2>,\n', filecont)

   729     # add new lines after }

   730     filecont = re.sub('"\s*}','"\n}\n', filecont)

   731     filecont = re.sub('}\s*,','},\n', filecont)

   734     filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)

   736     # character encoding, reserved latex characters

   737     filecont = re.sub('{\\\&}', '&', filecont)

   738     filecont = re.sub('\\\&', '&', filecont)

   740     # do checking for open braces to get format correct

   741     open_brace_count = 0

   742     brace_split = re.split('([{}])',filecont)

   744     # rebuild filecont

   745     filecont = ''

   747     for phrase in brace_split:

   748         if phrase == '{':

   749             open_brace_count = open_brace_count + 1

   750         elif phrase == '}':

   751             open_brace_count = open_brace_count - 1

   752             if open_brace_count == 0:

   753                 filecont = filecont + '\n'

   755         filecont = filecont + phrase

   757     filecont2 = bibtex_replace_abbreviations(filecont)

   759     # gather

   760     filecont = filecont2.splitlines()

   761     i=0

   762     j=0         # count the number of blank lines

   763     for line in filecont:

   764         # ignore blank lines

   765         if line == '' or line == ' ':

   766             j = j+1

   767             continue

   768         filecont[i] = line + '\n'

   769         i = i+1

   771     # get rid of the extra stuff at the end of the array

   772     # (The extra stuff are duplicates that are in the array because

   773     # blank lines were removed.)

   774     length = len( filecont)

   775     filecont[length-j:length] = []

   777     return filecont

   780 def filehandler(filepath):

   781     try:

   782         fd = open(filepath, 'r')

   783         filecont_source = fd.readlines()

   784         fd.close()

   785     except:

   786         print 'Could not open file:', filepath

   787     washeddata = bibtexwasher(filecont_source)

   788     outdata = bibtexdecoder(washeddata)

   789     print '/**'

   790     print '\page references References'

   791     print

   792     for line in outdata:

   793         print line

   794     print '*/'

   797 # main program

   799 def main():

   800     import sys

   801     if sys.argv[1:]:

   802         filepath = sys.argv[1]

   803     else:

   804         print "No input file"

   805         sys.exit()

   806     filehandler(filepath)

   808 if __name__ == "__main__": main()

   811 # end python script

author	Peter Kovacs <kpeter@inf.elte.hu>
	Sat, 20 Feb 2010 18:39:03 +0100
changeset 839	f3bc4e9b5f3a
parent 745	68792fb2870f
child 836	c841ae1aca29
permissions	-rwxr-xr-x