#! /usr/bin/env python
"""
BibTeX to Doxygen converter
Usage: python bib2dox.py bibfile.bib > bibfile.dox

This file is a part of LEMON, a generic C++ optimization library.

**********************************************************************

This code is a modification of the BibTeX to XML converter
by Vidar Bronken Gundersen et al.
See the original copyright notices below.

**********************************************************************

Decoder for bibliographic data, BibTeX
Usage: python bibtex2xml.py bibfile.bib > bibfile.xml

v.8
(c)2002-06-23 Vidar Bronken Gundersen
http://bibtexml.sf.net/
Reuse approved as long as this notification is kept.
Licence: GPL.

Contributions/thanks to:
Egon Willighagen, http://sf.net/projects/jreferences/
Richard Mahoney (for providing a test case)

Edited by Sara Sprenkle to be more robust and to handle more BibTeX features.
(c) 2003-01-15

1. Changed bibtex: tags to bibxml: tags.
2. Use xmlns:bibxml="http://bibtexml.sf.net/"
3. Allow spaces between @type and first {
4. "author" fields with multiple authors split by " and "
   are put in separate xml "bibxml:author" tags.
5. Option for titles: words are capitalized
   only if first letter in title or capitalized inside braces
6. Removes braces from within field values
7. Ignores comments in the bibtex file (including @comment{ or % )
8. Replaces some special latex tags, e.g., replaces ~ with ' '
9. Handles bibtex @string abbreviations
   --> includes bibtex's default abbreviations for months
   --> does concatenation of abbr # " more " and " more " # abbr
10. Handles @type( ... ) or @type{ ... }
11. The keywords field is split on , or ; and put into separate xml
    "bibxml:keywords" tags
12. Ignores @preamble
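
As a rough illustration of the conversion (a hypothetical input entry;
the exact output depends on the fields present):

  @article{dijkstra59,
    author  = {Dijkstra, E. W.},
    title   = {A note on two problems in connexion with graphs},
    journal = {Numerische Mathematik},
    volume  = {1},
    pages   = {269--271},
    year    = {1959}
  }

becomes a Doxygen section roughly of the form

  \section dijkstra59 [Dij59]
  <div style="margin-top: -4ex; margin-left: 10em;">
  E. W. Dijkstra.
  A note on two problems in connexion with graphs.
  <em>Numerische Mathematik</em>,
  1:269-271,
  1959.
  </div>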

Known Limitations
1. Does not transform LaTeX encodings like math mode and special
   LaTeX symbols.
2. Does not parse author fields into first and last names.
   E.g., it does not do anything special to an author whose name is
   in the form LAST_NAME, FIRST_NAME.
   In the "author" tag, the name will show up as
   <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
3. Does not handle "crossref" fields other than to print
   <bibxml:crossref>...</bibxml:crossref>
4. Does not inform the user of the input's format errors. You just won't
   be able to transform the file later with XSL.

You will have to manually edit the XML output if you need to handle
these (and unknown) limitations.

"""

import re

# set of valid name characters
valid_name_chars = r'[\w\-:]'

#
# define global regular expression variables
#
author_rex = re.compile(r'\s+and\s+')
rembraces_rex = re.compile(r'[{}]')
capitalize_rex = re.compile(r'({[^}]*})')

# used by bibtexkeywords(data) in the original converter (unused here)
keywords_rex = re.compile(r'[,;]')

# used by concat_line(line)
concatsplit_rex = re.compile(r'\s*#\s*')

# split on {, }, or " in verify_out_of_braces
delimiter_rex = re.compile(r'([{}"])', re.I)

field_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
data_rex = re.compile(r'\s*(\w*)\s*=\s*([^,]*),?')

url_rex = re.compile(r'\\url\{([^}]*)\}')

#
# styles for html formatting
#
divstyle = 'margin-top: -4ex; margin-left: 10em;'

#
# convert \url{...} commands into HTML links
#
def transformurls(str):
    return url_rex.sub(r'<a href="\1">\1</a>', str)
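
# A quick illustration of the URL rewriting above (illustrative input,
# not taken from the original sources):
#   transformurls(r'See \url{http://lemon.cs.elte.hu/}.')
#   returns 'See <a href="http://lemon.cs.elte.hu/">http://lemon.cs.elte.hu/</a>.'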

#
# return the string parameter without braces
#
def removebraces(str):
    return rembraces_rex.sub('', str)

#
# latex-specific replacements
# (do this after braces were removed)
#
def latexreplacements(line):
    line = line.replace('~', ' ')
    line = line.replace('\\\'a', 'á')
    line = line.replace('\\"a', 'ä')
    line = line.replace('\\\'e', 'é')
    line = line.replace('\\"e', 'ë')
    line = line.replace('\\\'i', 'í')
    line = line.replace('\\"i', 'ï')
    line = line.replace('\\\'o', 'ó')
    line = line.replace('\\"o', 'ö')
    line = line.replace('\\\'u', 'ú')
    line = line.replace('\\"u', 'ü')
    line = line.replace('\\H o', 'ő')    # Hungarian double acute
    line = line.replace('\\H u', 'ű')
    line = line.replace('\\\'A', 'Á')
    line = line.replace('\\"A', 'Ä')
    line = line.replace('\\\'E', 'É')
    line = line.replace('\\"E', 'Ë')
    line = line.replace('\\\'I', 'Í')
    line = line.replace('\\"I', 'Ï')
    line = line.replace('\\\'O', 'Ó')
    line = line.replace('\\"O', 'Ö')
    line = line.replace('\\\'U', 'Ú')
    line = line.replace('\\"U', 'Ü')
    line = line.replace('\\H O', 'Ő')
    line = line.replace('\\H U', 'Ű')

    return line

#
# copy characters from a string, decoding html entities (&xyz;)
#
def copychars(str, ifrom, count):
    result = ''
    i = ifrom
    c = 0
    html_spec = False
    while (i < len(str)) and (c < count):
        if str[i] == '&':
            html_spec = True
            if i+1 < len(str):
                result += str[i+1]
                c += 1
            i += 2
        else:
            if not html_spec:
                if ((str[i] >= 'A') and (str[i] <= 'Z')) or \
                   ((str[i] >= 'a') and (str[i] <= 'z')):
                    result += str[i]
                    c += 1
            elif str[i] == ';':
                html_spec = False
            i += 1

    return result
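
# Illustrative examples: plain letters are copied as-is, and an HTML
# entity such as '&auml;' contributes the letter right after the '&':
#   copychars('Dijkstra', 0, 3)     returns 'Dij'
#   copychars('&auml;rvore', 0, 2)  returns 'ar'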


#
# Handle a list of authors (separated by 'and').
# It returns a dictionary with the following entries:
#  - num: the number of authors,
#  - list: the list of the author names,
#  - text: the bibtex text (separated by commas and/or 'and'),
#  - abbrev: an abbreviation that can be used to identify the
#    bibliography entries.
#
def bibtexauthor(data):
    result = {}
    bibtex = ''
    result['list'] = author_rex.split(data)
    result['num'] = len(result['list'])
    for i, author in enumerate(result['list']):
        # general transformations
        author = latexreplacements(removebraces(author.strip()))
        # transform "Xyz, A. B." to "A. B. Xyz"
        pos = author.find(',')
        if pos != -1:
            author = author[pos+1:].strip() + ' ' + author[:pos].strip()
        result['list'][i] = author
        bibtex += author + '#'
    bibtex = bibtex[:-1]
    if result['num'] > 1:
        ix = bibtex.rfind('#')
        if result['num'] == 2:
            bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
        else:
            bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
    bibtex = bibtex.replace('#', ', ')
    result['text'] = bibtex

    result['abbrev'] = ''
    for author in result['list']:
        pos = author.rfind(' ') + 1
        count = 1
        if result['num'] == 1:
            count = 3
        result['abbrev'] += copychars(author, pos, count)

    return result
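
# Illustrative example (made-up author field):
#   bibtexauthor('Kruskal, J. B. and Prim, R. C.') returns
#   {'num': 2,
#    'list': ['J. B. Kruskal', 'R. C. Prim'],
#    'text': 'J. B. Kruskal and R. C. Prim',
#    'abbrev': 'KP'}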


#
# data = title string
# @return the capitalized title: the first letter of the title is
# capitalized and the rest is lower-cased, except that phrases inside
# braces keep their original capitalization
#
def capitalizetitle(data):
    title_list = capitalize_rex.split(data)
    title = ''
    count = 0
    for phrase in title_list:
        check = phrase.lstrip()

        # keep the phrase's capitalization the same
        if check.find('{') == 0:
            title += removebraces(phrase)
        else:
            # first word --> capitalize the first letter (after spaces)
            if count == 0:
                title += check.capitalize()
            else:
                title += phrase.lower()
            count = count + 1

    return title
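
# Illustrative example (hypothetical title):
#   capitalizetitle('The {LEMON} Graph Library')
#   returns 'The LEMON graph library'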


#
# @return the bibtex for the title
# @param data --> title string
# braces are removed from the title
#
def bibtextitle(data, entrytype):
    if entrytype in ('book', 'inbook'):
        title = removebraces(data.strip())
    else:
        title = removebraces(capitalizetitle(data.strip()))
    bibtex = title
    return bibtex


#
# comparison function for sorting the entry lists by their sort key
#
def entry_cmp(x, y):
    return cmp(x[0], y[0])


#
# generate the Doxygen code for the transformed "filecont_source"
#
def bibtexdecoder(filecont_source):
    filecont = []
    file = []

    # want @<alphanumeric chars><spaces>{<spaces><any chars>,
    pubtype_rex = re.compile(r'@(\w*)\s*{\s*(.*),')
    endtype_rex = re.compile(r'}\s*$')
    endtag_rex = re.compile(r'^\s*}\s*$')

    bracefield_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
    bracedata_rex = re.compile(r'\s*(\w*)\s*=\s*{(.*)},?')

    quotefield_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
    quotedata_rex = re.compile(r'\s*(\w*)\s*=\s*"(.*)",?')

    for line in filecont_source:
        line = line[:-1]

        # encode character entities
        line = line.replace('&', '&amp;')
        line = line.replace('<', '&lt;')
        line = line.replace('>', '&gt;')

        # start entry: publication type (store for later use)
        if pubtype_rex.match(line):
            entrycont = {}
            entry = []
            entrytype = pubtype_rex.sub(r'\g<1>', line)
            entrytype = entrytype.lower()
            entryid = pubtype_rex.sub(r'\g<2>', line)

        # end entry if just a }
        elif endtype_rex.match(line):
            # generate doxygen code for the entry

            # entry-type-related formatting
            if entrytype in ('book', 'inbook'):
                entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
                if not entrycont.has_key('author'):
                    entrycont['author'] = entrycont['editor']
                    entrycont['author']['text'] += ', editors'
            elif entrytype == 'article':
                entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
            elif entrytype in ('inproceedings', 'incollection', 'conference'):
                entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
            elif entrytype == 'techreport':
                if not entrycont.has_key('type'):
                    entrycont['type'] = 'Technical report'
            elif entrytype == 'mastersthesis':
                entrycont['type'] = 'Master\'s thesis'
            elif entrytype == 'phdthesis':
                entrycont['type'] = 'PhD thesis'

            # (field values were already latex-decoded when they were parsed)

            if entrycont.has_key('pages') and (entrycont['pages'] != ''):
                entrycont['pages'] = entrycont['pages'].replace('--', '-')

            if entrycont.has_key('author') and (entrycont['author'] != ''):
                entry.append(entrycont['author']['text'] + '.')
            if entrycont.has_key('title') and (entrycont['title'] != ''):
                entry.append(entrycont['title'] + '.')
            if entrycont.has_key('journal') and (entrycont['journal'] != ''):
                entry.append(entrycont['journal'] + ',')
            if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
                entry.append('In ' + entrycont['booktitle'] + ',')
            if entrycont.has_key('type') and (entrycont['type'] != ''):
                eline = entrycont['type']
                if entrycont.has_key('number') and (entrycont['number'] != ''):
                    eline += ' ' + entrycont['number']
                eline += ','
                entry.append(eline)
            if entrycont.has_key('institution') and (entrycont['institution'] != ''):
                entry.append(entrycont['institution'] + ',')
            if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
                entry.append(entrycont['publisher'] + ',')
            if entrycont.has_key('school') and (entrycont['school'] != ''):
                entry.append(entrycont['school'] + ',')
            if entrycont.has_key('address') and (entrycont['address'] != ''):
                entry.append(entrycont['address'] + ',')
            if entrycont.has_key('edition') and (entrycont['edition'] != ''):
                entry.append(entrycont['edition'] + ' edition,')
            if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
                entry.append(entrycont['howpublished'] + ',')
            if entrycont.has_key('volume') and (entrycont['volume'] != ''):
                eline = entrycont['volume']
                if entrycont.has_key('number') and (entrycont['number'] != ''):
                    eline += '(' + entrycont['number'] + ')'
                if entrycont.has_key('pages') and (entrycont['pages'] != ''):
                    eline += ':' + entrycont['pages']
                eline += ','
                entry.append(eline)
            else:
                if entrycont.has_key('pages') and (entrycont['pages'] != ''):
                    entry.append('pages ' + entrycont['pages'] + ',')
            if entrycont.has_key('year') and (entrycont['year'] != ''):
                if entrycont.has_key('month') and (entrycont['month'] != ''):
                    entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
                else:
                    entry.append(entrycont['year'] + '.')
            if entrycont.has_key('note') and (entrycont['note'] != ''):
                entry.append(entrycont['note'] + '.')
            if entrycont.has_key('url') and (entrycont['url'] != ''):
                entry.append(entrycont['url'] + '.')

            # generate keys for sorting and for the output
            sortkey = ''
            bibkey = ''
            if entrycont.has_key('author'):
                for author in entrycont['author']['list']:
                    sortkey += copychars(author, author.rfind(' ')+1, len(author))
                bibkey = entrycont['author']['abbrev']
            else:
                bibkey = 'x'
            if entrycont.has_key('year'):
                sortkey += entrycont['year']
                bibkey += entrycont['year'][-2:]
            if entrycont.has_key('title'):
                sortkey += entrycont['title']
            if entrycont.has_key('key'):
                sortkey = entrycont['key'] + sortkey
                bibkey = entrycont['key']
            entry.insert(0, sortkey)
            entry.insert(1, bibkey)
            entry.insert(2, entryid)

            # add the entry to the file contents
            filecont.append(entry)

        else:
            # field, publication info
            field = ''
            data = ''

            # field = {data} entries
            if bracedata_rex.match(line):
                field = bracefield_rex.sub(r'\g<1>', line)
                field = field.lower()
                data = bracedata_rex.sub(r'\g<2>', line)

            # field = "data" entries
            elif quotedata_rex.match(line):
                field = quotefield_rex.sub(r'\g<1>', line)
                field = field.lower()
                data = quotedata_rex.sub(r'\g<2>', line)

            # field = data entries
            elif data_rex.match(line):
                field = field_rex.sub(r'\g<1>', line)
                field = field.lower()
                data = data_rex.sub(r'\g<2>', line)

            if field == 'url':
                data = '\\url{' + data.strip() + '}'

            if field in ('author', 'editor'):
                entrycont[field] = bibtexauthor(data)
                line = ''
            elif field == 'title':
                line = bibtextitle(data, entrytype)
            elif field != '':
                line = removebraces(transformurls(data.strip()))

            if line != '':
                line = latexreplacements(line)
                entrycont[field] = line


    # sort entries
    filecont.sort(entry_cmp)

    # count the bibtex keys
    keytable = {}
    counttable = {}
    for entry in filecont:
        bibkey = entry[1]
        if not keytable.has_key(bibkey):
            keytable[bibkey] = 1
        else:
            keytable[bibkey] += 1

    for bibkey in keytable.keys():
        counttable[bibkey] = 0

    # generate output
    for entry in filecont:
        # generate the output key from the bibtex key
        bibkey = entry[1]
        entryid = entry[2]
        if keytable[bibkey] == 1:
            outkey = bibkey
        else:
            outkey = bibkey + chr(97 + counttable[bibkey])
            counttable[bibkey] += 1

        # append the entry code to the output
        file.append('\\section ' + entryid + ' [' + outkey + ']')
        file.append('<div style="' + divstyle + '">')
        for line in entry[3:]:
            file.append(line)
        file.append('</div>')
        file.append('')

    return file


#
# return 1 iff abbr is in line but not inside braces or quotes
# assumes that abbr appears only once on the line (out of braces and quotes)
#
def verify_out_of_braces(line, abbr):

    phrase_split = delimiter_rex.split(line)

    abbr_rex = re.compile(r'\b' + abbr + r'\b', re.I)

    open_brace = 0
    open_quote = 0

    for phrase in phrase_split:
        if phrase == "{":
            open_brace = open_brace + 1
        elif phrase == "}":
            open_brace = open_brace - 1
        elif phrase == '"':
            if open_quote == 1:
                open_quote = 0
            else:
                open_quote = 1
        elif abbr_rex.search(phrase):
            if open_brace == 0 and open_quote == 0:
                return 1

    return 0
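
# Illustrative examples (made-up lines):
#   verify_out_of_braces(' month = jan,', 'jan')            returns 1
#   verify_out_of_braces(' title = "A jan story",', 'jan')  returns 0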


#
# a line in the form phrase1 # phrase2 # ... # phrasen
# is returned as phrase1 phrase2 ... phrasen
# with the correct punctuation
# Bug: Doesn't always work with multiple abbreviations plugged in
#
def concat_line(line):
    # only look at the part after the equals sign
    field = field_rex.sub(r'\g<1>', line)
    rest = field_rex.sub(r'\g<2>', line)

    concat_line = field + ' ='

    pound_split = concatsplit_rex.split(rest)

    phrase_count = 0
    length = len(pound_split)

    for phrase in pound_split:
        phrase = phrase.strip()
        if phrase_count != 0:
            if phrase.startswith('"') or phrase.startswith('{'):
                phrase = phrase[1:]
        elif phrase.startswith('"'):
            phrase = phrase.replace('"', '{', 1)

        if phrase_count != length-1:
            if phrase.endswith('"') or phrase.endswith('}'):
                phrase = phrase[:-1]
        else:
            if phrase.endswith('"'):
                phrase = phrase[:-1]
                phrase = phrase + "}"
            elif phrase.endswith('",'):
                phrase = phrase[:-2]
                phrase = phrase + "},"

        # if the phrase ended with an escaped \#, add the # back
        if phrase.endswith('\\'):
            phrase = phrase + "#"
        concat_line = concat_line + ' ' + phrase

        phrase_count = phrase_count + 1

    return concat_line
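
# Illustrative example: two quoted phrases joined with # are merged into
# a single braced value (the inner spaces are kept as-is):
#   concat_line(' month = "January" # " 10--14,"')
#   returns 'month = {January  10--14,}'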


#
# substitute abbreviations into filecont
# @param filecont_source - string of data from file
#
def bibtex_replace_abbreviations(filecont_source):
    filecont = filecont_source.splitlines()

    # These are defined in bibtex, so we'll define them too
    abbr_list = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    value_list = ['January', 'February', 'March', 'April',
                  'May', 'June', 'July', 'August', 'September',
                  'October', 'November', 'December']

    abbr_rex = []
    total_abbr_count = 0

    front = r'\b'
    back = r'(,?)\b'

    for x in abbr_list:
        abbr_rex.append(re.compile(front + abbr_list[total_abbr_count] + back, re.I))
        total_abbr_count = total_abbr_count + 1


    abbrdef_rex = re.compile(r'\s*@string\s*{\s*(' + valid_name_chars + r'*)\s*=(.*)',
                             re.I)

    comment_rex = re.compile(r'@comment\s*{', re.I)
    preamble_rex = re.compile(r'@preamble\s*{', re.I)

    waiting_for_end_string = 0
    filecont2 = ''

    for line in filecont:
        if line == ' ' or line == '':
            continue

        if waiting_for_end_string:
            if re.search('}', line):
                waiting_for_end_string = 0
                continue

        if abbrdef_rex.search(line):
            abbr = abbrdef_rex.sub(r'\g<1>', line)

            if abbr_list.count(abbr) == 0:
                val = abbrdef_rex.sub(r'\g<2>', line)
                abbr_list.append(abbr)
                value_list.append(val.strip())
                abbr_rex.append(re.compile(front + abbr_list[total_abbr_count] + back, re.I))
                total_abbr_count = total_abbr_count + 1
            waiting_for_end_string = 1
            continue

        if comment_rex.search(line):
            waiting_for_end_string = 1
            continue

        if preamble_rex.search(line):
            waiting_for_end_string = 1
            continue


        # replace subsequent abbreviations with the value
        abbr_count = 0

        for x in abbr_list:

            if abbr_rex[abbr_count].search(line):
                if verify_out_of_braces(line, abbr_list[abbr_count]) == 1:
                    line = abbr_rex[abbr_count].sub(value_list[abbr_count] + r'\g<1>', line)
                # Check for # concatenations
                if concatsplit_rex.search(line):
                    line = concat_line(line)
            abbr_count = abbr_count + 1


        filecont2 = filecont2 + line + '\n'


    # Do one final pass over the file

    # make sure we didn't end up with {" or }" after the substitution
    filecont2 = filecont2.replace('{"', '{{')
    filecont2 = filecont2.replace('"}', '}}')

    afterquotevalue_rex = re.compile(r'"\s*,\s*')
    afterbrace_rex = re.compile(r'"\s*}')
    afterbracevalue_rex = re.compile(r'(=\s*{[^=]*)},\s*')

    # add new lines to data that changed because of abbreviation substitutions
    filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
    filecont2 = afterbrace_rex.sub('"\n}', filecont2)
    filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)

    return filecont2
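
# Illustrative example: with the built-in month abbreviations, a washed
# line such as ' month = jan,' comes back as ' month = January,';
# user-defined @string abbreviations are substituted the same way into
# later lines.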

#
# convert @type( ... ) to @type{ ... }
#
def no_outer_parens(filecont):

    # do checking for open parens
    # will convert to braces
    paren_split = re.split(r'([(){}])', filecont)

    open_paren_count = 0
    open_type = 0
    look_next = 0

    # rebuild filecont
    filecont = ''

    at_rex = re.compile(r'@\w*')

    for phrase in paren_split:
        if look_next == 1:
            if phrase == '(':
                phrase = '{'
                open_paren_count = open_paren_count + 1
            else:
                open_type = 0
            look_next = 0

        if phrase == '(':
            open_paren_count = open_paren_count + 1

        elif phrase == ')':
            open_paren_count = open_paren_count - 1
            if open_type == 1 and open_paren_count == 0:
                phrase = '}'
                open_type = 0

        elif at_rex.search(phrase):
            open_type = 1
            look_next = 1

        filecont = filecont + phrase

    return filecont
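
# Illustrative example (hypothetical entry):
#   no_outer_parens('@article( dijkstra59, year = 1959 )')
#   returns '@article{ dijkstra59, year = 1959 }'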


#
# make all whitespace into just one space
# format the bibtex file into a usable form.
#
def bibtexwasher(filecont_source):

    space_rex = re.compile(r'\s+')
    comment_rex = re.compile(r'\s*%')

    filecont = []

    # remove trailing and excessive whitespace
    # ignore comments
    for line in filecont_source:
        line = line.strip()
        line = space_rex.sub(' ', line)
        # ignore comments
        if not comment_rex.match(line) and line != '':
            filecont.append(' ' + line)

    filecont = ''.join(filecont)

    # the file is now one long string

    filecont = no_outer_parens(filecont)

    #
    # split lines according to preferred syntax scheme
    #
    filecont = re.sub(r'(=\s*{[^=]*)},', '\g<1>},\n', filecont)

    # add new lines after commas that are after values
    filecont = re.sub(r'"\s*,', '",\n', filecont)
    filecont = re.sub(r'=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
    filecont = re.sub(r'(@\w*)\s*({(\s*)[^,\s]*)\s*,',
                      '\n\n\g<1>\g<2>,\n', filecont)

    # add new lines after }
    filecont = re.sub(r'"\s*}', '"\n}\n', filecont)
    filecont = re.sub(r'}\s*,', '},\n', filecont)


    filecont = re.sub(r'@(\w*)', '\n@\g<1>', filecont)

    # character encoding, reserved latex characters
    filecont = re.sub(r'{\\&}', '&', filecont)
    filecont = re.sub(r'\\&', '&', filecont)

    # do checking for open braces to get format correct
    open_brace_count = 0
    brace_split = re.split(r'([{}])', filecont)

    # rebuild filecont
    filecont = ''

    for phrase in brace_split:
        if phrase == '{':
            open_brace_count = open_brace_count + 1
        elif phrase == '}':
            open_brace_count = open_brace_count - 1
            if open_brace_count == 0:
                filecont = filecont + '\n'

        filecont = filecont + phrase

    filecont2 = bibtex_replace_abbreviations(filecont)

    # gather
    filecont = filecont2.splitlines()
    i = 0
    j = 0    # count the number of blank lines
    for line in filecont:
        # ignore blank lines
        if line == '' or line == ' ':
            j = j + 1
            continue
        filecont[i] = line + '\n'
        i = i + 1

    # get rid of the extra stuff at the end of the array
    # (The extra stuff are duplicates that are in the array because
    # blank lines were removed.)
    length = len(filecont)
    filecont[length-j:length] = []

    return filecont


def filehandler(filepath):
    try:
        fd = open(filepath, 'r')
        filecont_source = fd.readlines()
        fd.close()
    except IOError:
        # report and bail out instead of crashing on the undefined data
        print 'Could not open file:', filepath
        return
    washeddata = bibtexwasher(filecont_source)
    outdata = bibtexdecoder(washeddata)
    print '/**'
    print '\\page references References'
    print
    for line in outdata:
        print line
    print '*/'


# main program

def main():
    import sys
    if sys.argv[1:]:
        filepath = sys.argv[1]
    else:
        print "No input file"
        sys.exit()
    filehandler(filepath)

if __name__ == "__main__": main()


# end python script