gdb-utils

TEI/XMLデータを扱う際のツール群です。

Install

pip install git+https://github.com/nakamura196/gdb-utils.git

インポート

from gdb_utils.core import *

使い方

client = TeiUtils()

ダウンロード

url = "https://utda.github.io/goethe/data/xml/goethe.xml"
path = "data/goethe.xml"
client.download(url, path)

分析

client.get_tag_freq(path)

頻度の降順で表示

client.df
Tag Count
8 persName 15
55 lb 13
10 p 9
50 app 8
51 lem 8
... ... ...
28 width 1
29 handDesc 1
1 teiHeader 1
31 history 1
30 handNote 1

61 rows × 2 columns

名前の昇順で表示

client.df_tag
Tag Count
0 TEI 1
18 addrLine 1
17 address 1
50 app 8
5 author 2
... ... ...
4 title 7
3 titleStmt 1
41 variantEncoding 1
28 width 1
13 witness 2

61 rows × 2 columns

Romaで、使用しているタグのチェックボックスを一括でオンにするためのJSスクリプトを生成

print(client.get_javascript())

function checkCheckboxesWithTextValues(textValues) {
    // 存在しなかった要素名を格納する配列
    let notFound = [];

    // 指定されたテキスト値のリストをループ処理
    textValues.forEach(function(textToMatch) {
        // テキストに一致する .mdc-list-item__primary-text 要素を取得
        let found = false;
        document.querySelectorAll('.mdc-list-item__primary-text').forEach(function(item) {
            if (item.textContent.trim() === textToMatch) {
                found = true;
                let checkbox = item.closest('.mdc-list-item').querySelector('.mdc-checkbox__native-control');
                if (checkbox) {
                    checkbox.checked = true;
                }
            }
        });

        // 要素が見つからなければ notFound 配列に追加
        if (!found) {
            notFound.push(textToMatch);
        }
    });

    // 存在しなかった要素名を返す
    return notFound;
}

// 指定したいテキスト値のリスト
const itemsToCheck = ["TEI", "addrLine", "address", "app", "author", "back", "bibl", "body", "closer", "correspAction", "correspDesc", "country", "date", "dimensions", "district", "div", "editor", "editorialDecl", "encodingDesc", "extent", "fileDesc", "handDesc", "handNote", "height", "history", "idno", "institution", "lb", "lem", "listPerson", "listWit", "location", "msDesc", "msIdentifier", "objectDesc", "opener", "origin", "p", "persName", "person", "physDesc", "placeName", "profileDesc", "provenance", "publicationStmt", "publisher", "rdg", "resp", "respStmt", "salute", "signed", "sourceDesc", "space", "supportDesc", "teiHeader", "text", "title", "titleStmt", "variantEncoding", "width", "witness"];

// チェックしたい項目のリストを関数に渡し、存在しなかった項目を取得
const itemsNotFound = checkCheckboxesWithTextValues(itemsToCheck);

// 存在しなかった項目をコンソールに出力
if (itemsNotFound.length > 0) {
    console.log('These items were not found:', itemsNotFound);
} else {
    console.log('All items were found and checked.');
}

指定した要素と属性を基準に、XMLファイルを複数のファイルに分割

path = "data/sample.xml"
output = "data/sample/out.xml"
client.split_xml_file(path, output, "pb")
[]
path = "data/sample.xml"
output = "data/sample2/out.xml"
client.split_xml_file(path, output, "div", split_attr="type", split_value="sample")
[]