| | 1 | #!/usr/bin/env /usr/local/Python/bin/python2.1 |
| | 2 | """ |
| | 3 | Decoder for bibliographic data, BibTeX |
| | 4 | Usage: python bibtex2xml.py bibfile.bib > bibfile.xml |
| | 5 | |
| | 6 | v.8 |
| | 7 | (c)2002-06-23 Vidar Bronken Gundersen |
| | 8 | http://bibtexml.sf.net/ |
| | 9 | Reuse approved as long as this notification is kept. |
| | 10 | Licence: GPL. |
| | 11 | |
| | 12 | Contributions/thanks to: |
| | 13 | Egon Willighagen, http://sf.net/projects/jreferences/ |
| | 14 | Richard Mahoney (for providing a test case) |
| | 15 | |
Edited by Sara Sprenkle to be more robust and handle more bibtex features. (c) 2003-01-15
| | 17 | 1. Changed bibtex: tags to bibxml: tags. |
| | 18 | 2. Use xmlns:bibxml="http://bibtexml.sf.net/" |
| | 19 | 3. Allow spaces between @type and first { |
| | 20 | 4. "author" fields with multiple authors split by " and " |
| | 21 | are put in separate xml "bibxml:author" tags. |
| | 22 | 5. Option for Titles: words are capitalized |
| | 23 | only if first letter in title or capitalized inside braces |
| | 24 | 6. Removes braces from within field values |
| | 25 | 7. Ignores comments in bibtex file (including @comment{ or % ) |
| | 26 | 8. Replaces some special latex tags, e.g., replaces ~ with ' ' |
| | 27 | 9. Handles bibtex @string abbreviations |
| | 28 | --> includes bibtex's default abbreviations for months |
| | 29 | --> does concatenation of abbr # " more " and " more " # abbr |
| | 30 | 10. Handles @type( ... ) or @type{ ... } |
| | 31 | 11. The keywords field is split on , or ; and put into separate xml |
| | 32 | "bibxml:keywords" tags |
| | 33 | 12. Ignores @preamble |
| | 34 | |
| | 35 | Known Limitations |
| | 36 | 1. Does not transform Latex encoding like math mode and special latex symbols. |
| | 37 | 2. Does not parse author fields into first and last names. |
| | 38 | E.g., It does not do anything special to an author whose name is in the form LAST_NAME, FIRST_NAME |
| | 39 | In "author" tag, will show up as <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author> |
| | 40 | 3. Does not handle "crossref" fields other than to print <bibxml:crossref>...</bibxml:crossref> |
| | 41 | 4. Does not inform user of the input's format errors. You just won't be able to |
| | 42 | transform the file later with XSL |
| | 43 | |
| | 44 | You will have to manually edit the XML output if you need to handle |
| | 45 | these (and unknown) limitations. |
| | 46 | |
| | 47 | """ |
| | 48 | |
| | 49 | import string, re |
| | 50 | |
| | 51 | # set of valid name characters |
| | 52 | valid_name_chars = '[\w\-:]' |
| | 53 | |
| | 54 | # |
| | 55 | # define global regular expression variables |
| | 56 | # |
| | 57 | author_rex = re.compile('\s+and\s+') |
| | 58 | rembraces_rex = re.compile('[{}]') |
| | 59 | capitalize_rex = re.compile('({\w*})') |
| | 60 | |
| | 61 | # used by bibtexkeywords(data) |
| | 62 | keywords_rex = re.compile('[,;]') |
| | 63 | |
| | 64 | # used by concat_line(line) |
| | 65 | concatsplit_rex = re.compile('\s*#\s*') |
| | 66 | |
| | 67 | # split on {, }, or " in verify_out_of_braces |
| | 68 | delimiter_rex = re.compile('([{}"])',re.I) |
| | 69 | |
| | 70 | field_rex = re.compile('\s*(\w*)\s*=\s*(.*)') |
| | 71 | data_rex = re.compile('\s*(\w*)\s*=\s*([^,]*),?') |
| | 72 | |
| | 73 | url_rex = re.compile('\\\url\{([^}]*)\}') |
| | 74 | |
| | 75 | |
#
# replace \url{...} markup with an HTML anchor: <a href="...">...</a>
#
def transformurls(str):
    return url_rex.sub(r'<a href="\1">\1</a>', str)
| | 81 | |
#
# strip every '{' and '}' character from the string parameter
#
def removebraces(str):
    cleaned = rembraces_rex.sub('', str)
    return cleaned
| | 87 | |
#
# latex-specific replacements
# (do this after braces were removed)
#
# table of (latex escape, literal character) pairs, applied in order.
# \H o and \H u have no exact latin-1 equivalent, so the closest
# letter is used (as in the original hand-written sequence).
# Using str.replace instead of string.replace keeps the behavior
# identical in Python 2 while also working on Python 3, where the
# string-module function no longer exists.
_latex_pairs = (
    ('~', ' '),
    ("\\'a", 'á'), ('\\"a', 'ä'),
    ("\\'e", 'é'), ('\\"e', 'ë'),
    ("\\'i", 'í'), ('\\"i', 'ï'),
    ("\\'o", 'ó'), ('\\"o', 'ö'),
    ("\\'u", 'ú'), ('\\"u', 'ü'),
    ('\\H o', 'õ'), ('\\H u', 'ü'),  # ũ does not exist
    ("\\'A", 'Á'), ('\\"A', 'Ä'),
    ("\\'E", 'É'), ('\\"E', 'Ë'),
    ("\\'I", 'Í'), ('\\"I', 'Ï'),
    ("\\'O", 'Ó'), ('\\"O', 'Ö'),
    ("\\'U", 'Ú'), ('\\"U', 'Ü'),
    ('\\H O', 'Õ'), ('\\H U', 'Ü'),  # Ũ does not exist
)

def latexreplacements(line):
    # replace each latex escape with its literal character
    for latex, char in _latex_pairs:
        line = line.replace(latex, char)
    return line
| | 120 | |
#
# copy up to `count` letters from `str` starting at index `ifrom`,
# decoding html entities (&xyz;) as the single letter after the '&'
#
def copychars(str, ifrom, count):
    out = ''
    pos = ifrom
    taken = 0
    in_entity = False
    size = len(str)
    while pos < size and taken < count:
        ch = str[pos]
        if ch == '&':
            # entity start: take the character right after the '&'
            in_entity = True
            if pos + 1 < size:
                out += str[pos + 1]
                taken += 1
            pos += 2
        else:
            if not in_entity:
                # plain ASCII letters only
                if ('A' <= ch <= 'Z') or ('a' <= ch <= 'z'):
                    out += ch
                    taken += 1
            elif ch == ';':
                # entity terminator: resume normal copying
                in_entity = False
            pos += 1

    return out
| | 147 | |
| | 148 | |
#
# Handle a list of authors (separated by 'and').
# It gives back a dict with the following keys:
#  - num: the number of authors,
#  - list: the list of the author names,
#  - text: the bibtex text (separated by commas and/or 'and')
#  - abbrev: abbreviation that can be used to refer to the
#            bibliography entries
#
def bibtexauthor(data):
    result = {}
    bibtex = ''
    # split the raw field on ' and ' into individual author names
    result['list'] = author_rex.split(data)
    result['num'] = len(result['list'])
    for i, author in enumerate(result['list']):
        # general transformations
        author = latexreplacements(removebraces(author.strip()))
        # transform "Xyz, A. B." to "A. B. Xyz"
        pos = author.find(',')
        if pos != -1:
            author = author[pos+1:].strip() + ' ' + author[:pos].strip()
        result['list'][i] = author
        bibtex += author + '#'
    bibtex = bibtex[:-1]  # drop the trailing '#' separator
    if result['num'] > 1:
        # join the last two authors with 'and' (serial comma for 3+),
        # the rest with plain commas
        ix = bibtex.rfind('#')
        if result['num'] == 2:
            bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
        else:
            bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
        bibtex = bibtex.replace('#', ', ')
    result['text'] = bibtex

    # abbreviation: first letter(s) of each surname
    # (three letters when there is a single author)
    result['abbrev'] = ''
    for author in result['list']:
        pos = author.rfind(' ') + 1  # start of the last name
        count = 1
        if result['num'] == 1:
            count = 3
        result['abbrev'] += copychars(author, pos, count)

    return result
| | 191 | |
| | 192 | |
#
# @param data: the title string
# @return the title with only its first word capitalized; phrases
#         wrapped in braces keep their capitalization (braces removed)
#
def capitalizetitle(data):
    pieces = capitalize_rex.split(data)
    rebuilt = ''
    seen = 0
    for piece in pieces:
        probe = piece.lstrip()
        if probe.startswith('{'):
            # brace-protected phrase: keep its capitalization as-is
            rebuilt += removebraces(piece)
        else:
            if seen == 0:
                # first unprotected phrase: capitalize its first letter
                # (leading whitespace is dropped here, as before)
                rebuilt += probe.capitalize()
            else:
                rebuilt += piece.lower()
            # only unprotected phrases advance the counter
            seen += 1

    return rebuilt
| | 217 | |
| | 218 | |
#
# @param data --> raw title string
# @param entrytype --> bibtex entry type; book titles keep their case
# @return the title text with braces removed
#
def bibtextitle(data, entrytype):
    stripped = data.strip()
    if entrytype in ('book', 'inbook'):
        # books keep the author's capitalization
        return removebraces(stripped)
    return removebraces(capitalizetitle(stripped))
| | 231 | |
| | 232 | |
#
# comparison function used to sort entry lists by their sort key
# (first element). Returns -1/0/1 exactly like the old cmp() builtin,
# which was removed in Python 3, so the explicit expression is used.
#
def entry_cmp(x, y):
    return (x[0] > y[0]) - (x[0] < y[0])
| | 238 | |
| | 239 | |
#
# print the XML for the transformed "filecont_source"
#
def bibtexdecoder(filecont_source):
    """Turn washed bibtex lines into doxygen/HTML table rows.

    filecont_source: list of pre-washed bibtex lines (one field or
    delimiter per line, as produced by bibtexwasher).
    Returns the list of output lines.
    """
    # filecont accumulates one list per entry: [sortkey, bibkey, text...]
    filecont = []
    # the finished output lines
    file = []

    # want @<alphanumeric chars><spaces>{<spaces><any chars>,
    pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')
    endtype_rex = re.compile('}\s*$')
    endtag_rex = re.compile('^\s*}\s*$')

    bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
    bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')

    quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
    quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')

    for line in filecont_source:
        line = line[:-1]  # drop the trailing newline

        # encode character entities
        # NOTE(review): these replacement targets look garbled -- they
        # presumably should produce '&amp;', '&lt;' and '&gt;'; as written
        # the first is a no-op. Verify against the original source.
        line = string.replace(line, '&', '&')
        line = string.replace(line, '<', '<')
        line = string.replace(line, '>', '>')

        # start entry: publication type (store for later use)
        if pubtype_rex.match(line):
            # want @<alphanumeric chars><spaces>{<spaces><any chars>,
            entrycont = {}
            entry = []
            entrytype = pubtype_rex.sub('\g<1>',line)
            entrytype = string.lower(entrytype)
            # entryid = pubtype_rex.sub('\g<2>', line)

        # end entry if just a }
        elif endtype_rex.match(line):
            # generate doxygen code for the entry

            # entry type related formattings
            if entrytype in ('book', 'inbook'):
                entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
                if not entrycont.has_key('author'):
                    # books without an author are credited to their editors
                    entrycont['author'] = entrycont['editor']
                    entrycont['author']['text'] += ', editors'
            elif entrytype == 'article':
                entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
            elif entrytype in ('inproceedings', 'incollection', 'conference'):
                entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
            elif entrytype == 'techreport':
                if not entrycont.has_key('type'):
                    entrycont['type'] = 'Technical report'
            elif entrytype == 'mastersthesis':
                entrycont['type'] = 'Master\'s thesis'
            elif entrytype == 'phdthesis':
                entrycont['type'] = 'PhD thesis'

            # NOTE(review): this loop has no effect -- 'eline' iterates
            # over the dict keys and the reassignment does not write back
            # into entrycont. Probably intended to run latexreplacements
            # over the stored values; confirm before changing.
            for eline in entrycont:
                if eline != '':
                    eline = latexreplacements(eline)

            # page ranges: bibtex '--' becomes a plain '-'
            if entrycont.has_key('pages') and (entrycont['pages'] != ''):
                entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')

            # assemble the citation text in conventional order
            if entrycont.has_key('author') and (entrycont['author'] != ''):
                entry.append(entrycont['author']['text'] + '.')
            if entrycont.has_key('title') and (entrycont['title'] != ''):
                entry.append(entrycont['title'] + '.')
            if entrycont.has_key('journal') and (entrycont['journal'] != ''):
                entry.append(entrycont['journal'] + ',')
            if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
                entry.append('In ' + entrycont['booktitle'] + ',')
            if entrycont.has_key('type') and (entrycont['type'] != ''):
                eline = entrycont['type']
                if entrycont.has_key('number') and (entrycont['number'] != ''):
                    eline += ' ' + entrycont['number']
                eline += ','
                entry.append(eline)
            if entrycont.has_key('institution') and (entrycont['institution'] != ''):
                entry.append(entrycont['institution'] + ',')
            if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
                entry.append(entrycont['publisher'] + ',')
            if entrycont.has_key('school') and (entrycont['school'] != ''):
                entry.append(entrycont['school'] + ',')
            if entrycont.has_key('address') and (entrycont['address'] != ''):
                entry.append(entrycont['address'] + ',')
            if entrycont.has_key('edition') and (entrycont['edition'] != ''):
                entry.append(entrycont['edition'] + ' edition,')
            if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
                entry.append(entrycont['howpublished'] + ',')
            # volume(number):pages, -- or "pages ..." when no volume
            if entrycont.has_key('volume') and (entrycont['volume'] != ''):
                eline = entrycont['volume'];
                if entrycont.has_key('number') and (entrycont['number'] != ''):
                    eline += '(' + entrycont['number'] + ')'
                if entrycont.has_key('pages') and (entrycont['pages'] != ''):
                    eline += ':' + entrycont['pages']
                eline += ','
                entry.append(eline)
            else:
                if entrycont.has_key('pages') and (entrycont['pages'] != ''):
                    entry.append('pages ' + entrycont['pages'] + ',')
            if entrycont.has_key('year') and (entrycont['year'] != ''):
                if entrycont.has_key('month') and (entrycont['month'] != ''):
                    entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
                else:
                    entry.append(entrycont['year'] + '.')
            if entrycont.has_key('note') and (entrycont['note'] != ''):
                entry.append(entrycont['note'] + '.')

            # generate keys for sorting and for the output
            sortkey = ''
            bibkey = ''
            if entrycont.has_key('author'):
                for author in entrycont['author']['list']:
                    sortkey += copychars(author, author.rfind(' ')+1, len(author))
                bibkey = entrycont['author']['abbrev']
            else:
                bibkey = 'x'
            if entrycont.has_key('year'):
                sortkey += entrycont['year']
                bibkey += entrycont['year'][-2:]
            if entrycont.has_key('title'):
                sortkey += entrycont['title']
            # an explicit 'key' field overrides the generated bibkey
            if entrycont.has_key('key'):
                sortkey = entrycont['key'] + sortkey
                bibkey = entrycont['key']
            entry.insert(0, sortkey)
            entry.insert(1, bibkey)

            # add the entry to the file contents
            filecont.append(entry)

        else:
            # field, publication info
            field = ''
            data = ''

            # field = {data} entries
            if bracedata_rex.match(line):
                field = bracefield_rex.sub('\g<1>', line)
                field = string.lower(field)
                data = bracedata_rex.sub('\g<2>', line)

            # field = "data" entries
            elif quotedata_rex.match(line):
                field = quotefield_rex.sub('\g<1>', line)
                field = string.lower(field)
                data = quotedata_rex.sub('\g<2>', line)

            # field = data entries
            elif data_rex.match(line):
                field = field_rex.sub('\g<1>', line)
                field = string.lower(field)
                data = data_rex.sub('\g<2>', line)

            if field in ('author', 'editor'):
                entrycont[field] = bibtexauthor(data)
                line = ''
            elif field == 'title':
                line = bibtextitle(data, entrytype)
            elif field != '':
                line = removebraces(transformurls(data.strip()))

            if line != '':
                line = latexreplacements(line)
                entrycont[field] = line


    # sort entries
    filecont.sort(entry_cmp)

    # count the bibtex keys (duplicates get a/b/c... suffixes below)
    keytable = {}
    counttable = {}
    for entry in filecont:
        bibkey = entry[1]
        if not keytable.has_key(bibkey):
            keytable[bibkey] = 1
        else:
            keytable[bibkey] += 1

    for bibkey in keytable.keys():
        counttable[bibkey] = 0

    # generate output
    for entry in filecont:
        # generate output key from the bibtex key
        bibkey = entry[1]
        if keytable[bibkey] == 1:
            outkey = bibkey
        else:
            # disambiguate duplicates: key + 'a', 'b', 'c', ...
            outkey = bibkey + chr(97 + counttable[bibkey])
            counttable[bibkey] += 1

        # append the entry code to the output
        file.append('<tr valign="top">\n' + \
                    '<td>[' + outkey + ']</td>')
        file.append('<td>')
        file.append('\\anchor ' + outkey)
        for line in entry[2:]:
            file.append(line)
        file.append('</td>\n</tr>')
        file.append('')

    return file
| | 445 | |
| | 446 | |
#
# return 1 iff abbr occurs in line outside of all braces and quotes
# (assumes abbr appears at most once outside braces/quotes)
#
def verify_out_of_braces(line, abbr):

    chunks = delimiter_rex.split(line)

    abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)

    depth = 0    # current brace nesting depth
    quoted = 0   # 1 while inside a "..." string

    for chunk in chunks:
        if chunk == '{':
            depth += 1
        elif chunk == '}':
            depth -= 1
        elif chunk == '"':
            quoted = 1 - quoted
        elif abbr_rex.search(chunk) and depth == 0 and quoted == 0:
            return 1

    return 0
| | 475 | |
| | 476 | |
#
# a line in the form  field = phrase1 # phrase2 # ... # phrasen
# is returned as      field = phrase1 phrase2 ... phrasen
# with the quote/brace punctuation fixed up so the concatenated
# value parses as a single braced value.
# Bug: Doesn't always work with multiple abbreviations plugged in
#
def concat_line(line):
    # only look at part after equals
    field = field_rex.sub('\g<1>', line)
    rest = field_rex.sub('\g<2>', line)

    result = field + ' ='

    pound_split = concatsplit_rex.split(rest)

    phrase_count = 0
    length = len(pound_split)

    for phrase in pound_split:
        phrase = phrase.strip()

        # every phrase but the first loses its opening delimiter
        # (an unreachable second startswith('"') branch was removed:
        #  it could never fire after this test)
        if phrase_count != 0:
            if phrase.startswith('"') or phrase.startswith('{'):
                phrase = phrase[1:]

        if phrase_count != length-1:
            # middle phrase: drop its closing delimiter
            if phrase.endswith('"') or phrase.endswith('}'):
                phrase = phrase[:-1]
        else:
            # last phrase: close the concatenated value with a brace
            if phrase.endswith('"'):
                phrase = phrase[:-1]
                phrase = phrase + "}"
            elif phrase.endswith('",'):
                phrase = phrase[:-2]
                phrase = phrase + "},"

        # if phrase did have \#, add the \# back
        if phrase.endswith('\\'):
            phrase = phrase + "#"
        result = result + ' ' + phrase

        phrase_count = phrase_count + 1

    return result
| | 522 | |
| | 523 | |
#
# substitute abbreviations into filecont
# @param filecont_source - string of data from file
# @return the file contents as a single string with @string
#         abbreviations expanded and @comment/@preamble blocks removed
#
def bibtex_replace_abbreviations(filecont_source):
    filecont = filecont_source.splitlines()

    # These are defined in bibtex, so we'll define them too
    abbr_list = ['jan','feb','mar','apr','may','jun',
                 'jul','aug','sep','oct','nov','dec']
    value_list = ['January','February','March','April',
                  'May','June','July','August','September',
                  'October','November','December']

    abbr_rex = []
    total_abbr_count = 0

    # each abbreviation must stand alone as a word; an optional trailing
    # comma is captured so the substitution can keep it (\g<1>)
    front = '\\b'
    back = '(,?)\\b'

    for x in abbr_list:
        abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
        total_abbr_count = total_abbr_count + 1


    abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
                             re.I)

    comment_rex = re.compile('@comment\s*{',re.I)
    preamble_rex = re.compile('@preamble\s*{',re.I)

    waiting_for_end_string = 0
    i = 0
    filecont2 = ''

    for line in filecont:
        if line == ' ' or line == '':
            continue

        # inside an @string/@comment/@preamble: its closing '}' ends it
        # NOTE(review): lines without '}' while waiting fall through and
        # are processed/emitted; confirm this is intended for multi-line
        # blocks.
        if waiting_for_end_string:
            if re.search('}',line):
                waiting_for_end_string = 0
                continue

        # an @string definition registers a new abbreviation
        if abbrdef_rex.search(line):
            abbr = abbrdef_rex.sub('\g<1>', line)

            if abbr_list.count(abbr) == 0:
                val = abbrdef_rex.sub('\g<2>', line)
                abbr_list.append(abbr)
                value_list.append(string.strip(val))
                abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
                total_abbr_count = total_abbr_count + 1
            waiting_for_end_string = 1
            continue

        if comment_rex.search(line):
            waiting_for_end_string = 1
            continue

        if preamble_rex.search(line):
            waiting_for_end_string = 1
            continue


        # replace subsequent abbreviations with the value
        abbr_count = 0

        for x in abbr_list:

            if abbr_rex[abbr_count].search(line):
                # only substitute when the abbreviation is not quoted/braced
                if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:
                    line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)
                # Check for # concatenations
                if concatsplit_rex.search(line):
                    line = concat_line(line)
            abbr_count = abbr_count + 1


        filecont2 = filecont2 + line + '\n'
        i = i+1


    # Do one final pass over file

    # make sure that didn't end up with {" or }" after the substitution
    filecont2 = filecont2.replace('{"','{{')
    filecont2 = filecont2.replace('"}','}}')

    afterquotevalue_rex = re.compile('"\s*,\s*')
    afterbrace_rex = re.compile('"\s*}')
    afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')

    # add new lines to data that changed because of abbreviation substitutions
    filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
    filecont2 = afterbrace_rex.sub('"\n}', filecont2)
    filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)

    return filecont2
| | 623 | |
#
# convert @type( ... ) to @type{ ... }
#
def no_outer_parens(filecont):

    # split on every paren/brace so delimiters appear as their own pieces
    pieces = re.split(r'([(){}])', filecont)

    depth = 0        # open-paren nesting depth
    in_entry = 0     # 1 after seeing an @type whose body uses parens
    expect_open = 0  # next piece may be the entry's opening paren

    rebuilt = ''

    at_rex = re.compile(r'@\w*')

    for piece in pieces:
        if expect_open == 1:
            if piece == '(':
                # this is the entry's opening delimiter: make it a brace
                piece = '{'
                depth = depth + 1
            else:
                in_entry = 0
            expect_open = 0

        if piece == '(':
            depth = depth + 1

        elif piece == ')':
            depth = depth - 1
            # only the entry's outermost closing paren becomes a brace
            if in_entry == 1 and depth == 0:
                piece = '}'
                in_entry = 0

        elif at_rex.search( piece ):
            in_entry = 1
            expect_open = 1

        rebuilt = rebuilt + piece

    return rebuilt
| | 667 | |
| | 668 | |
#
# make all whitespace into just one space
# format the bibtex file into a usable form:
# one field (or entry delimiter) per line, comments stripped
#
def bibtexwasher(filecont_source):

    space_rex = re.compile('\s+')
    comment_rex = re.compile('\s*%')

    filecont = []

    # remove trailing and excessive whitespace
    # ignore comments
    for line in filecont_source:
        line = string.strip(line)
        line = space_rex.sub(' ', line)
        # ignore comments
        if not comment_rex.match(line) and line != '':
            filecont.append(' '+ line)

    filecont = string.join(filecont, '')

    # the file is in one long string

    filecont = no_outer_parens(filecont)

    #
    # split lines according to preferred syntax scheme
    #
    filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)

    # add new lines after commas that are after values
    filecont = re.sub('"\s*,', '",\n', filecont)
    filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
    filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',
                      '\n\n\g<1>\g<2>,\n', filecont)

    # add new lines after }
    filecont = re.sub('"\s*}','"\n}\n', filecont)
    filecont = re.sub('}\s*,','},\n', filecont)


    filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)

    # character encoding, reserved latex characters
    # NOTE(review): the replacement strings look garbled -- '{\&}' and
    # '\&' presumably should become the entity '&amp;'; verify against
    # the original source.
    filecont = re.sub('{\\\&}', '&', filecont)
    filecont = re.sub('\\\&', '&', filecont)

    # do checking for open braces to get format correct
    open_brace_count = 0
    brace_split = re.split('([{}])',filecont)

    # rebuild filecont, inserting a newline after each entry's final '}'
    filecont = ''

    for phrase in brace_split:
        if phrase == '{':
            open_brace_count = open_brace_count + 1
        elif phrase == '}':
            open_brace_count = open_brace_count - 1
            if open_brace_count == 0:
                filecont = filecont + '\n'

        filecont = filecont + phrase

    filecont2 = bibtex_replace_abbreviations(filecont)

    # gather: compact the non-blank lines to the front of the list
    filecont = filecont2.splitlines()
    i=0
    j=0   # count the number of blank lines
    for line in filecont:
        # ignore blank lines
        if line == '' or line == ' ':
            j = j+1
            continue
        filecont[i] = line + '\n'
        i = i+1

    # get rid of the extra stuff at the end of the array
    # (The extra stuff are duplicates that are in the array because
    # blank lines were removed.)
    length = len( filecont)
    filecont[length-j:length] = []

    return filecont
| | 755 | |
| | 756 | |
| | 757 | def filehandler(filepath): |
| | 758 | try: |
| | 759 | fd = open(filepath, 'r') |
| | 760 | filecont_source = fd.readlines() |
| | 761 | fd.close() |
| | 762 | except: |
| | 763 | print 'Could not open file:', filepath |
| | 764 | washeddata = bibtexwasher(filecont_source) |
| | 765 | outdata = bibtexdecoder(washeddata) |
| | 766 | print '/**' |
| | 767 | print '\page references References' |
| | 768 | print |
| | 769 | print '<table border="0" cellspacing="5px" width="100%">' |
| | 770 | print |
| | 771 | for line in outdata: |
| | 772 | print line |
| | 773 | print '</table>' |
| | 774 | print |
| | 775 | print '*/' |
| | 776 | |
| | 777 | |
| | 778 | # main program |
| | 779 | |
| | 780 | def main(): |
| | 781 | import sys |
| | 782 | if sys.argv[1:]: |
| | 783 | filepath = sys.argv[1] |
| | 784 | else: |
| | 785 | print "No input file" |
| | 786 | sys.exit() |
| | 787 | filehandler(filepath) |
| | 788 | |
| | 789 | if __name__ == "__main__": main() |
| | 790 | |
| | 791 | |
| | 792 | # end python script |