Implement gradio demo
Browse files- app.py +42 -0
- config.yaml +13 -22
- marcai/process.py +0 -6
- marcai/processing/comparisons.py +0 -22
- model.onnx +3 -0
app.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pymarc
|
| 3 |
+
from marcai.process import process
|
| 4 |
+
from marcai.utils.parsing import record_dict
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from marcai.predict import predict_onnx
|
| 7 |
+
from marcai.utils import load_config
|
| 8 |
+
|
| 9 |
+
def compare(file1, file2):
    """Classify whether two single-record MARC XML files describe the same work.

    Args:
        file1: Path or file-like object for a MARC XML file holding one record.
        file2: Path or file-like object for the second MARC XML file.

    Returns:
        dict: Gradio ``Label`` payload mapping ``"match"`` and ``"not match"``
        to complementary scores (the model output and ``1 -`` it).

    Raises:
        ValueError: If either uploaded file contains no MARC records.
    """
    records1 = pymarc.parse_xml_to_array(file1)
    records2 = pymarc.parse_xml_to_array(file2)
    if not records1 or not records2:
        # Fail with a clear message instead of an opaque IndexError on [0].
        raise ValueError("Each uploaded file must contain at least one MARC record.")

    # Single-row DataFrames so the pairwise feature pipeline can consume them.
    df1 = pd.DataFrame([record_dict(records1[0])])
    df2 = pd.DataFrame([record_dict(records2[0])])

    df = process(df1, df2)

    # Load model config to learn which engineered feature columns the model expects.
    config = load_config("config.yaml")
    model_onnx = "model.onnx"

    # Run ONNX model on exactly the configured feature columns.
    input_df = df[config["model"]["features"]]
    prediction = predict_onnx(model_onnx, input_df).item()

    return {"match": prediction, "not match": 1 - prediction}
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# Gradio UI wiring: two MARC XML uploads in, a match / not-match label out.
file_inputs = [
    gr.File(label="MARC XML File 1"),
    gr.File(label="MARC XML File 2"),
]

interface = gr.Interface(
    fn=compare,
    inputs=file_inputs,
    outputs=gr.Label(label="Classification"),
    title="MARC Record Matcher",
    description="Upload two MARC XML files with one record each.",
    allow_flagging="never",
)

interface.launch()
|
config.yaml
CHANGED
|
@@ -1,31 +1,22 @@
|
|
| 1 |
model:
|
| 2 |
-
|
| 3 |
features:
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
# Size of hidden layers
|
| 12 |
hidden_sizes:
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
# Training
|
| 17 |
-
batch_size: 512
|
| 18 |
-
weight_decay: 0.0
|
| 19 |
-
max_epochs: -1
|
| 20 |
-
|
| 21 |
-
# Disable early stopping with -1
|
| 22 |
-
patience: 20
|
| 23 |
-
|
| 24 |
lr: 0.006
|
|
|
|
| 25 |
optimizer: Adam
|
|
|
|
| 26 |
saved_models_dir: saved_models
|
| 27 |
-
|
| 28 |
-
# Paths to dataset splits
|
| 29 |
test_processed_path: data/202303_goldfinch_set_1.1/processed/test_processed.csv
|
| 30 |
train_processed_path: data/202303_goldfinch_set_1.1/processed/train_processed.csv
|
| 31 |
val_processed_path: data/202303_goldfinch_set_1.1/processed/val_processed.csv
|
|
|
|
|
|
| 1 |
model:
|
| 2 |
+
batch_size: 512
|
| 3 |
features:
|
| 4 |
+
- title_tokenset
|
| 5 |
+
- title_agg
|
| 6 |
+
- author
|
| 7 |
+
- publisher
|
| 8 |
+
- pub_date
|
| 9 |
+
- pub_place
|
| 10 |
+
- pagination
|
|
|
|
| 11 |
hidden_sizes:
|
| 12 |
+
- 32
|
| 13 |
+
- 64
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
lr: 0.006
|
| 15 |
+
max_epochs: -1
|
| 16 |
optimizer: Adam
|
| 17 |
+
patience: 20
|
| 18 |
saved_models_dir: saved_models
|
|
|
|
|
|
|
| 19 |
test_processed_path: data/202303_goldfinch_set_1.1/processed/test_processed.csv
|
| 20 |
train_processed_path: data/202303_goldfinch_set_1.1/processed/train_processed.csv
|
| 21 |
val_processed_path: data/202303_goldfinch_set_1.1/processed/val_processed.csv
|
| 22 |
+
weight_decay: 0.0
|
marcai/process.py
CHANGED
|
@@ -109,7 +109,6 @@ def process(df0, df1):
|
|
| 109 |
df0["raw"], df1["raw"], null_value=0.5
|
| 110 |
)
|
| 111 |
|
| 112 |
-
|
| 113 |
# Token sort ratio
|
| 114 |
result_df["publisher"] = comps.token_sort_similarity(
|
| 115 |
df0["publisher"], df1["publisher"], null_value=0.5
|
|
@@ -140,11 +139,6 @@ def process(df0, df1):
|
|
| 140 |
df0[weights.keys()], df1[weights.keys()], weights.values(), null_value=0
|
| 141 |
)
|
| 142 |
|
| 143 |
-
# Phonetic difference
|
| 144 |
-
result_df["title_phonetic"] = comps.phonetic_similarity(
|
| 145 |
-
df0["title"], df1["title"], null_value=0
|
| 146 |
-
)
|
| 147 |
-
|
| 148 |
# Length difference
|
| 149 |
result_df["title_length"] = comps.length_similarity(
|
| 150 |
df0["title"], df1["title"], null_value=0.5
|
|
|
|
| 109 |
df0["raw"], df1["raw"], null_value=0.5
|
| 110 |
)
|
| 111 |
|
|
|
|
| 112 |
# Token sort ratio
|
| 113 |
result_df["publisher"] = comps.token_sort_similarity(
|
| 114 |
df0["publisher"], df1["publisher"], null_value=0.5
|
|
|
|
| 139 |
df0[weights.keys()], df1[weights.keys()], weights.values(), null_value=0
|
| 140 |
)
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
# Length difference
|
| 143 |
result_df["title_length"] = comps.length_similarity(
|
| 144 |
df0["title"], df1["title"], null_value=0.5
|
marcai/processing/comparisons.py
CHANGED
|
@@ -3,9 +3,6 @@ import re
|
|
| 3 |
import pandas as pd
|
| 4 |
from thefuzz import fuzz
|
| 5 |
import textdistance
|
| 6 |
-
import fuzzy
|
| 7 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 8 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 9 |
|
| 10 |
|
| 11 |
|
|
@@ -190,25 +187,6 @@ def length_similarity(se0, se1, null_value):
|
|
| 190 |
|
| 191 |
return pd.Series(col)
|
| 192 |
|
| 193 |
-
def phonetic_similarity(se0, se1, null_value):
|
| 194 |
-
soundex = fuzzy.Soundex(4)
|
| 195 |
-
|
| 196 |
-
se0_np = se0.to_numpy(dtype=str)
|
| 197 |
-
se1_np = se1.to_numpy(dtype=str)
|
| 198 |
-
|
| 199 |
-
def compare_words(str0, str1):
|
| 200 |
-
words0 = str0.split()
|
| 201 |
-
words1 = str1.split()
|
| 202 |
-
|
| 203 |
-
sounds0 = [soundex(word) for word in words0]
|
| 204 |
-
sounds1 = [soundex(word) for word in words1]
|
| 205 |
-
|
| 206 |
-
return sum(s0 == s1 for s0, s1 in zip(sounds0, sounds1)) / max(len(sounds0), len(sounds1))
|
| 207 |
-
|
| 208 |
-
col = np.vectorize(compare_words)(se0_np, se1_np)
|
| 209 |
-
|
| 210 |
-
return pd.Series(col)
|
| 211 |
-
|
| 212 |
|
| 213 |
def jaccard_similarity(se0, se1, null_value):
|
| 214 |
se0_np = se0.to_numpy(dtype=str)
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
from thefuzz import fuzz
|
| 5 |
import textdistance
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
|
|
|
|
| 187 |
|
| 188 |
return pd.Series(col)
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
|
| 191 |
def jaccard_similarity(se0, se1, null_value):
|
| 192 |
se0_np = se0.to_numpy(dtype=str)
|
model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8a549a29ebb618819a227d9568e8c1a6555e4f6407c3b4031a9170f4746ecdde
|
| 3 |
+
size 10669
|