Contents

import pandas as pd
import re
from tqdm import tqdm
import pickle
import copy
# Register tqdm's pandas integration so Series/DataFrame gain `progress_apply`.
tqdm.pandas()
# Dataset source: https://huggingface.co/datasets/heegyu/namuwiki/tree/main
df = pd.read_parquet('data/namuwiki_20210301.parquet')
# Print the first rows to inspect the loaded columns.
print(df.head())
        title                                               text  \
0           !                                    #redirect λŠλ‚Œν‘œ\n   
1      !!μ•„μ•—!!  \n[λͺ©μ°¨]\n\n'''{{{+1 !!ああっと!!}}}'''\n\n== κ°œμš” ==\...   
2        β€œβ€¦β€¦β€  ||<-2><tablebordercolor=#878787><tablealign=ri...   
3           #  [[λΆ„λ₯˜:특수 문자]]\n[include(ν‹€:λ‹€λ₯Έ 뜻1, other1=μŒμ•…μ—μ„œ μ‚¬μš©...   
4  #FairyJoke  [include(ν‹€:λ§ν¬μ‹œ 주의, 링크=[[\\#FairyJoke]] λ˜λŠ” [[#F...   

                                        contributors namespace  
0                                r:hoon12560,namubot            
1  110.46.34.123,kirby10,max0243,218.54.117.149,r...            
2  milkbutter,116.122.29.64,211.54.69.240,180.68....            
3  topenkun,125.186.248.118,benquan1812,180.231.1...            
4  r:db,lotot,swontrdg,58.224.144.55,guylian,39.1...            
# Keep only the page title and its text; the other columns
# (contributors, namespace) are dropped here.
df = df[['title', 'text']]
df
title text
0 ! #redirect λŠλ‚Œν‘œ\n
1 !!μ•„μ•—!! \n[λͺ©μ°¨]\n\n'''{{{+1 !!ああっと!!}}}'''\n\n== κ°œμš” ==\...
2 β€œβ€¦β€¦β€ ||<-2><tablebordercolor=#878787><tablealign=ri...
3 # [[λΆ„λ₯˜:특수 문자]]\n[include(ν‹€:λ‹€λ₯Έ 뜻1, other1=μŒμ•…μ—μ„œ μ‚¬μš©...
4 #FairyJoke [include(ν‹€:λ§ν¬μ‹œ 주의, 링크=[[\\#FairyJoke]] λ˜λŠ” [[#F...
... ... ...
867019 κ΄‘μ €μš° 푸리 #redirect κ΄‘μ €μš° μ‹œν‹°\n
867020 μ—μŠ€μ‹œλ” #redirect μ΄μŠ€μ‹œ\n
867021 μΉ­λ‹€μ˜€ ν™©ν•˜μ΄ #redirect μΉ­λ‹€μ˜€ FC\n
867022 순혜옹주 [[λΆ„λ₯˜:μ‘°μ„ μ˜ 후ꢁ]]\n[include(ν‹€:μ‘°μ„ μ˜ 후ꢁ(ν™˜μ‘° ~ μ—°μ‚°κ΅°))]\n...
867023 ν—ˆλ‚œ 젠예 #redirect ν—ˆλ‚œ μ‘Ήμ‚° λ£½λ¨Ό\n

867024 rows Γ— 2 columns

# Keep only the pages whose text contains a table attribute ('<table...').
filtered_df = df[df['text'].str.contains('<table')]
filtered_df
title text
2 β€œβ€¦β€¦β€ ||<-2><tablebordercolor=#878787><tablealign=ri...
6 #Fairy_dancing_in_lake [include(ν‹€:λ§ν¬μ‹œ 주의, 링크=[[\\#Fairy_dancing_in_la...
14 &(μ‹±κΈ€) [include(ν‹€:ν•˜λ§ˆμ‚¬ν‚€ μ•„μœ λ―Έμ˜ μ‹±κΈ€)]\n||||||<tablewidth=1...
18 νŽ˜ν…”κΈ°μš°μŠ€ λ‘œλ§ˆλ„€μ½©ν‹° [include(ν‹€:λ‹€λ₯Έ 뜻1, other1=ν•­μ„±, rd1=λ² ν…”κ²Œμš°μŠ€)]\n[inc...
19 μ‚¬ν…ŒλΌ(Re: μ œλ‘œλΆ€ν„° μ‹œμž‘ν•˜λŠ” 이세계 μƒν™œ) [include(ν‹€:ν† λ‘  ν•©μ˜, ν† λ‘ μ£Όμ†Œ1=ANullAndFascinatedShoe...
... ... ...
867012 마λ₯Έ λ‚¨μž ||<table width=300><table bordercolor=#000><ta...
867014 λ‹ˆλ‚˜(λ‘œμ–„ 크라운) [[λΆ„λ₯˜:λ‘œμ–„ 크라운]]\n[include(ν‹€:μƒμœ„ λ¬Έμ„œ, top1=λ‘œμ–„ 크라운)]...
867015 ν˜œμ„ μ˜Ήμ£Ό [[λΆ„λ₯˜:μ‘°μ„ μ˜ 후ꢁ]]\n[include(ν‹€:μ‘°μ„ μ˜ 후ꢁ(ν™˜μ‘° ~ μ—°μ‚°κ΅°))]\n...
867016 μ•„μ¦ˆμ‚¬ μ•„μ΄μžμ™€ ||<table width=100%><width=58%><tablebordercol...
867022 순혜옹주 [[λΆ„λ₯˜:μ‘°μ„ μ˜ 후ꢁ]]\n[include(ν‹€:μ‘°μ„ μ˜ 후ꢁ(ν™˜μ‘° ~ μ—°μ‚°κ΅°))]\n...

283385 rows Γ— 2 columns

# Pages that carry a table-of-contents marker tend to have a reasonable amount
# of content, so use the marker as the filter criterion.
# regex=False is required: as a regular expression '[...]' is a character class
# and would match any page containing one of the characters, not the literal
# marker. The literal is written in proper Hangul (it was mis-encoded).
# .copy() makes the result an independent frame so later column assignments do
# not trigger SettingWithCopyWarning.
filtered_df = filtered_df[filtered_df['text'].str.contains('[목차]', regex=False)].copy()
filtered_df
title text
2 β€œβ€¦β€¦β€ ||<-2><tablebordercolor=#878787><tablealign=ri...
6 #Fairy_dancing_in_lake [include(ν‹€:λ§ν¬μ‹œ 주의, 링크=[[\\#Fairy_dancing_in_la...
14 &(μ‹±κΈ€) [include(ν‹€:ν•˜λ§ˆμ‚¬ν‚€ μ•„μœ λ―Έμ˜ μ‹±κΈ€)]\n||||||<tablewidth=1...
18 νŽ˜ν…”κΈ°μš°μŠ€ λ‘œλ§ˆλ„€μ½©ν‹° [include(ν‹€:λ‹€λ₯Έ 뜻1, other1=ν•­μ„±, rd1=λ² ν…”κ²Œμš°μŠ€)]\n[inc...
19 μ‚¬ν…ŒλΌ(Re: μ œλ‘œλΆ€ν„° μ‹œμž‘ν•˜λŠ” 이세계 μƒν™œ) [include(ν‹€:ν† λ‘  ν•©μ˜, ν† λ‘ μ£Όμ†Œ1=ANullAndFascinatedShoe...
... ... ...
867012 마λ₯Έ λ‚¨μž ||<table width=300><table bordercolor=#000><ta...
867014 λ‹ˆλ‚˜(λ‘œμ–„ 크라운) [[λΆ„λ₯˜:λ‘œμ–„ 크라운]]\n[include(ν‹€:μƒμœ„ λ¬Έμ„œ, top1=λ‘œμ–„ 크라운)]...
867015 ν˜œμ„ μ˜Ήμ£Ό [[λΆ„λ₯˜:μ‘°μ„ μ˜ 후ꢁ]]\n[include(ν‹€:μ‘°μ„ μ˜ 후ꢁ(ν™˜μ‘° ~ μ—°μ‚°κ΅°))]\n...
867016 μ•„μ¦ˆμ‚¬ μ•„μ΄μžμ™€ ||<table width=100%><width=58%><tablebordercol...
867022 순혜옹주 [[λΆ„λ₯˜:μ‘°μ„ μ˜ 후ꢁ]]\n[include(ν‹€:μ‘°μ„ μ˜ 후ꢁ(ν™˜μ‘° ~ μ—°μ‚°κ΅°))]\n...

274635 rows Γ— 2 columns

# The first table on a page is, in most cases, the info table, so keep only
# the text that precedes the table-of-contents marker.
# assign() returns a new frame instead of writing into a filtered slice, which
# is what raised SettingWithCopyWarning. The marker literal is written in
# proper Hangul (it was mis-encoded).
filtered_df = filtered_df.assign(
    text=filtered_df['text'].apply(lambda x: x.split('[목차]')[0])
)
filtered_df
<ipython-input-12-8d5787fbdacd>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['text'] = filtered_df['text'].apply(lambda x: x.split('[λͺ©μ°¨]')[0])
title text
2 β€œβ€¦β€¦β€ ||<-2><tablebordercolor=#878787><tablealign=ri...
6 #Fairy_dancing_in_lake [include(ν‹€:λ§ν¬μ‹œ 주의, 링크=[[\\#Fairy_dancing_in_la...
14 &(μ‹±κΈ€) [include(ν‹€:ν•˜λ§ˆμ‚¬ν‚€ μ•„μœ λ―Έμ˜ μ‹±κΈ€)]\n||||||<tablewidth=1...
18 νŽ˜ν…”κΈ°μš°μŠ€ λ‘œλ§ˆλ„€μ½©ν‹° [include(ν‹€:λ‹€λ₯Έ 뜻1, other1=ν•­μ„±, rd1=λ² ν…”κ²Œμš°μŠ€)]\n[inc...
19 μ‚¬ν…ŒλΌ(Re: μ œλ‘œλΆ€ν„° μ‹œμž‘ν•˜λŠ” 이세계 μƒν™œ) [include(ν‹€:ν† λ‘  ν•©μ˜, ν† λ‘ μ£Όμ†Œ1=ANullAndFascinatedShoe...
... ... ...
867012 마λ₯Έ λ‚¨μž ||<table width=300><table bordercolor=#000><ta...
867014 λ‹ˆλ‚˜(λ‘œμ–„ 크라운) [[λΆ„λ₯˜:λ‘œμ–„ 크라운]]\n[include(ν‹€:μƒμœ„ λ¬Έμ„œ, top1=λ‘œμ–„ 크라운)]...
867015 ν˜œμ„ μ˜Ήμ£Ό [[λΆ„λ₯˜:μ‘°μ„ μ˜ 후ꢁ]]\n[include(ν‹€:μ‘°μ„ μ˜ 후ꢁ(ν™˜μ‘° ~ μ—°μ‚°κ΅°))]\n...
867016 μ•„μ¦ˆμ‚¬ μ•„μ΄μžμ™€ ||<table width=100%><width=58%><tablebordercol...
867022 순혜옹주 [[λΆ„λ₯˜:μ‘°μ„ μ˜ 후ꢁ]]\n[include(ν‹€:μ‘°μ„ μ˜ 후ꢁ(ν™˜μ‘° ~ μ—°μ‚°κ΅°))]\n...

274635 rows Γ— 2 columns

def get_table_contents(text):
    """Return every table-row line found in ``text``.

    A row is a stretch on a single line that starts with ``||`` and ends with
    ``||`` immediately followed by a newline; the trailing newline is kept in
    each returned string. Rows not followed by a newline are not matched.

    Returns:
        A list of matched row strings in order of appearance (empty when
        nothing matches).
    """
    row_pattern = r"\|\|.*\|\|\n"
    # No capture groups in the pattern, so findall yields the full matches.
    return re.findall(row_pattern, text)
# Extract the table-row lines of every page into a new 'table_contents' column.
# assign() builds a new frame, so no SettingWithCopyWarning is raised even when
# filtered_df is the result of an earlier boolean filter.
filtered_df = filtered_df.assign(
    table_contents=filtered_df['text'].progress_apply(get_table_contents)
)
filtered_df
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 274635/274635 [00:03<00:00, 86791.68it/s]
<ipython-input-223-c12b80ec17d1>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['table_contents'] = filtered_df['text'].progress_apply(lambda x: get_table_contents(x))
title text table_contents
2 β€œβ€¦β€¦β€ ||<-2><tablebordercolor=#878787><tablealign=ri... [||<-2><tablebordercolor=#878787><tablealign=r...
6 #Fairy_dancing_in_lake [include(ν‹€:λ§ν¬μ‹œ 주의, 링크=[[\\#Fairy_dancing_in_la... []
14 &(μ‹±κΈ€) [include(ν‹€:ν•˜λ§ˆμ‚¬ν‚€ μ•„μœ λ―Έμ˜ μ‹±κΈ€)]\n||||||<tablewidth=1... [||||||<tablewidth=100%> '''[[ν•˜λ§ˆμ‚¬ν‚€ μ•„μœ λ―Έ]]의 μ‹±κΈ€''...
18 νŽ˜ν…”κΈ°μš°μŠ€ λ‘œλ§ˆλ„€μ½©ν‹° [include(ν‹€:λ‹€λ₯Έ 뜻1, other1=ν•­μ„±, rd1=λ² ν…”κ²Œμš°μŠ€)]\n[inc... [||<tablealign=center><bgcolor=#ffffff> [[파일:페...
19 μ‚¬ν…ŒλΌ(Re: μ œλ‘œλΆ€ν„° μ‹œμž‘ν•˜λŠ” 이세계 μƒν™œ) [include(ν‹€:ν† λ‘  ν•©μ˜, ν† λ‘ μ£Όμ†Œ1=ANullAndFascinatedShoe... [||<-2><bgcolor=#414a4c> {{{#white μ• λ‹ˆλ©”μ΄μ…˜}}}[* ...
... ... ... ...
867012 마λ₯Έ λ‚¨μž ||<table width=300><table bordercolor=#000><ta... [||<table width=300><table bordercolor=#000><t...
867014 λ‹ˆλ‚˜(λ‘œμ–„ 크라운) [[λΆ„λ₯˜:λ‘œμ–„ 크라운]]\n[include(ν‹€:μƒμœ„ λ¬Έμ„œ, top1=λ‘œμ–„ 크라운)]... [||<tablealign=center><tablewidth=70%><-2><bgc...
867015 ν˜œμ„ μ˜Ήμ£Ό [[λΆ„λ₯˜:μ‘°μ„ μ˜ 후ꢁ]]\n[include(ν‹€:μ‘°μ„ μ˜ 후ꢁ(ν™˜μ‘° ~ μ—°μ‚°κ΅°))]\n... [||<-2><tablealign=right><tablewidth=400><tabl...
867016 μ•„μ¦ˆμ‚¬ μ•„μ΄μžμ™€ ||<table width=100%><width=58%><tablebordercol... [||<table width=100%><width=58%><tableborderco...
867022 순혜옹주 [[λΆ„λ₯˜:μ‘°μ„ μ˜ 후ꢁ]]\n[include(ν‹€:μ‘°μ„ μ˜ 후ꢁ(ν™˜μ‘° ~ μ—°μ‚°κ΅°))]\n... [||<-2><tablealign=right><tablewidth=400><tabl...

274635 rows Γ— 3 columns

def parse_table_contents(table_contents):
    """Convert raw table-row strings into predicate/object pairs.

    Each row is trimmed of surrounding whitespace and ``|`` characters and then
    split on ``||``. Rows containing ``<table`` and rows with a single cell are
    skipped; for rows with more than two cells only the last two are used.

    Args:
        table_contents: iterable of row strings.

    Returns:
        A list of ``{'predicate': ..., 'objects': ...}`` dicts, one per kept row.
    """
    pairs = []
    for row in table_contents:
        # str.strip treats its argument as a set of characters, so this removes
        # every leading/trailing '|', not just one '||' pair.
        body = row.strip().strip('||')
        if '<table' in body:
            continue
        cells = body.split('||')
        if len(cells) < 2:
            continue
        # The last two cells are taken as (predicate, objects).
        predicate, objects = cells[-2:]
        pairs.append({'predicate': predicate, 'objects': objects})
    return pairs
# Parse every page's table rows into predicate/object pairs.
# assign() builds a new frame, so no SettingWithCopyWarning is raised even when
# filtered_df is the result of an earlier boolean filter.
filtered_df = filtered_df.assign(
    parsed_table_contents=filtered_df['table_contents'].progress_apply(parse_table_contents)
)
filtered_df
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 274635/274635 [00:27<00:00, 9872.27it/s] 
<ipython-input-230-a5ab3ba10b00>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['parsed_table_contents'] = filtered_df['table_contents'].progress_apply(lambda x: parse_table_contents(x))
title text table_contents parsed_table_contents
2 β€œβ€¦β€¦β€ ||<-2><tablebordercolor=#878787><tablealign=ri... [||<-2><tablebordercolor=#878787><tablealign=r... [{'predicate': ' '''μ½”λ„ˆλͺ…''' ', 'objects': ' β€œβ€¦β€¦...
6 #Fairy_dancing_in_lake [include(ν‹€:λ§ν¬μ‹œ 주의, 링크=[[\\#Fairy_dancing_in_la... [] []
14 &(μ‹±κΈ€) [include(ν‹€:ν•˜λ§ˆμ‚¬ν‚€ μ•„μœ λ―Έμ˜ μ‹±κΈ€)]\n||||||<tablewidth=1... [||||||<tablewidth=100%> '''[[ν•˜λ§ˆμ‚¬ν‚€ μ•„μœ λ―Έ]]의 μ‹±κΈ€''... [{'predicate': '<width=40%> '''&''' (2003) ', ...
18 νŽ˜ν…”κΈ°μš°μŠ€ λ‘œλ§ˆλ„€μ½©ν‹° [include(ν‹€:λ‹€λ₯Έ 뜻1, other1=ν•­μ„±, rd1=λ² ν…”κ²Œμš°μŠ€)]\n[inc... [||<tablealign=center><bgcolor=#ffffff> [[파일:페... [{'predicate': '<bgcolor=#007f66> {{{#ffffff '...
19 μ‚¬ν…ŒλΌ(Re: μ œλ‘œλΆ€ν„° μ‹œμž‘ν•˜λŠ” 이세계 μƒν™œ) [include(ν‹€:ν† λ‘  ν•©μ˜, ν† λ‘ μ£Όμ†Œ1=ANullAndFascinatedShoe... [||<-2><bgcolor=#414a4c> {{{#white μ• λ‹ˆλ©”μ΄μ…˜}}}[* ... []
... ... ... ... ...
867012 마λ₯Έ λ‚¨μž ||<table width=300><table bordercolor=#000><ta... [||<table width=300><table bordercolor=#000><t... []
867014 λ‹ˆλ‚˜(λ‘œμ–„ 크라운) [[λΆ„λ₯˜:λ‘œμ–„ 크라운]]\n[include(ν‹€:μƒμœ„ λ¬Έμ„œ, top1=λ‘œμ–„ 크라운)]... [||<tablealign=center><tablewidth=70%><-2><bgc... [{'predicate': ' {{{#000,#fff '''클래슀'''}}} ', ...
867015 ν˜œμ„ μ˜Ήμ£Ό [[λΆ„λ₯˜:μ‘°μ„ μ˜ 후ꢁ]]\n[include(ν‹€:μ‘°μ„ μ˜ 후ꢁ(ν™˜μ‘° ~ μ—°μ‚°κ΅°))]\n... [||<-2><tablealign=right><tablewidth=400><tabl... [{'predicate': '<width=50> '''{{{#gold λ³Έλͺ…}}}''...
867016 μ•„μ¦ˆμ‚¬ μ•„μ΄μžμ™€ ||<table width=100%><width=58%><tablebordercol... [||<table width=100%><width=58%><tableborderco... [{'predicate': '<rowbgcolor=#ffff99> [[μ „μƒν–ˆλ”λ‹ˆ 슬...
867022 순혜옹주 [[λΆ„λ₯˜:μ‘°μ„ μ˜ 후ꢁ]]\n[include(ν‹€:μ‘°μ„ μ˜ 후ꢁ(ν™˜μ‘° ~ μ—°μ‚°κ΅°))]\n... [||<-2><tablealign=right><tablewidth=400><tabl... [{'predicate': '<width=50> '''{{{#gold μΆœμƒ}}}''...

274635 rows Γ— 4 columns

# Keep only the pages that produced at least one parsed pair.
# First step: count the parsed pairs per page. .copy() makes cnt_df an
# independent frame, so overwriting its column does not raise
# SettingWithCopyWarning.
cnt_df = filtered_df[['title', 'parsed_table_contents']].copy()
cnt_df['parsed_table_contents'] = cnt_df['parsed_table_contents'].apply(len)
cnt_df
<ipython-input-232-f18afb6f8856>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cnt_df['parsed_table_contents'] = cnt_df['parsed_table_contents'].apply(lambda x: len(x))
title parsed_table_contents
2 β€œβ€¦β€¦β€ 5
6 #Fairy_dancing_in_lake 0
14 &(μ‹±κΈ€) 1
18 νŽ˜ν…”κΈ°μš°μŠ€ λ‘œλ§ˆλ„€μ½©ν‹° 1
19 μ‚¬ν…ŒλΌ(Re: μ œλ‘œλΆ€ν„° μ‹œμž‘ν•˜λŠ” 이세계 μƒν™œ) 0
... ... ...
867012 마λ₯Έ λ‚¨μž 0
867014 λ‹ˆλ‚˜(λ‘œμ–„ 크라운) 9
867015 ν˜œμ„ μ˜Ήμ£Ό 5
867016 μ•„μ¦ˆμ‚¬ μ•„μ΄μžμ™€ 7
867022 순혜옹주 5

274635 rows Γ— 2 columns

# Keep the pages whose title has a non-zero parsed-pair count in cnt_df.
# .copy() detaches the result from the frame it was filtered from, so adding
# columns to it later does not raise SettingWithCopyWarning.
filtered_df = filtered_df[
    filtered_df['title'].isin(
        cnt_df.loc[cnt_df['parsed_table_contents'] != 0, 'title'].unique()
    )
].copy()
filtered_df
title text table_contents parsed_table_contents
2 β€œβ€¦β€¦β€ ||<-2><tablebordercolor=#878787><tablealign=ri... [||<-2><tablebordercolor=#878787><tablealign=r... [{'predicate': ' '''μ½”λ„ˆλͺ…''' ', 'objects': ' β€œβ€¦β€¦...
14 &(μ‹±κΈ€) [include(ν‹€:ν•˜λ§ˆμ‚¬ν‚€ μ•„μœ λ―Έμ˜ μ‹±κΈ€)]\n||||||<tablewidth=1... [||||||<tablewidth=100%> '''[[ν•˜λ§ˆμ‚¬ν‚€ μ•„μœ λ―Έ]]의 μ‹±κΈ€''... [{'predicate': '<width=40%> '''&''' (2003) ', ...
18 νŽ˜ν…”κΈ°μš°μŠ€ λ‘œλ§ˆλ„€μ½©ν‹° [include(ν‹€:λ‹€λ₯Έ 뜻1, other1=ν•­μ„±, rd1=λ² ν…”κ²Œμš°μŠ€)]\n[inc... [||<tablealign=center><bgcolor=#ffffff> [[파일:페... [{'predicate': '<bgcolor=#007f66> {{{#ffffff '...
21 레꡴루슀 μ½”λ₯΄λ‹ˆμ•„μŠ€ [include(ν‹€:λŒ€μ£„μ£Όκ΅(Re: μ œλ‘œλΆ€ν„° μ‹œμž‘ν•˜λŠ” 이세계 μƒν™œ))]\n\n||<... [||<bgcolor=#c0c0c0> {{{#ffffff '''라이트 노벨 μ„€μ •''... [{'predicate': '<bgcolor=#c0c0c0> {{{#ffffff '...
22 μ—ν‚€λ“œλ‚˜(Re: μ œλ‘œλΆ€ν„° μ‹œμž‘ν•˜λŠ” 이세계 μƒν™œ) [include(ν‹€:νšŒμ›μˆ˜μ •)]\n[include(ν‹€:λ§ˆλ…€(Re: μ œλ‘œλΆ€ν„° μ‹œμž‘ν•˜λŠ”... [||<width=57%><bgcolor=#ffffff> [[파일:μ—ν‚€λ“œλ‚˜ 라노벨 ... [{'predicate': '<width=57%><bgcolor=#ffffff> [...
... ... ... ... ...
867005 λΆˆλŸ‰λ°°λ“€(리틀 λ‚˜μ΄νŠΈλ©”μ–΄ μ‹œλ¦¬μ¦ˆ) ||<-2><table width=300><table bordercolor=#000... [||<-2><table width=300><table bordercolor=#00... [{'predicate': '<width=50%> [[파일:Male_Bully.pn...
867014 λ‹ˆλ‚˜(λ‘œμ–„ 크라운) [[λΆ„λ₯˜:λ‘œμ–„ 크라운]]\n[include(ν‹€:μƒμœ„ λ¬Έμ„œ, top1=λ‘œμ–„ 크라운)]... [||<tablealign=center><tablewidth=70%><-2><bgc... [{'predicate': ' {{{#000,#fff '''클래슀'''}}} ', ...
867015 ν˜œμ„ μ˜Ήμ£Ό [[λΆ„λ₯˜:μ‘°μ„ μ˜ 후ꢁ]]\n[include(ν‹€:μ‘°μ„ μ˜ 후ꢁ(ν™˜μ‘° ~ μ—°μ‚°κ΅°))]\n... [||<-2><tablealign=right><tablewidth=400><tabl... [{'predicate': '<width=50> '''{{{#gold λ³Έλͺ…}}}''...
867016 μ•„μ¦ˆμ‚¬ μ•„μ΄μžμ™€ ||<table width=100%><width=58%><tablebordercol... [||<table width=100%><width=58%><tableborderco... [{'predicate': '<rowbgcolor=#ffff99> [[μ „μƒν–ˆλ”λ‹ˆ 슬...
867022 순혜옹주 [[λΆ„λ₯˜:μ‘°μ„ μ˜ 후ꢁ]]\n[include(ν‹€:μ‘°μ„ μ˜ 후ꢁ(ν™˜μ‘° ~ μ—°μ‚°κ΅°))]\n... [||<-2><tablealign=right><tablewidth=400><tabl... [{'predicate': '<width=50> '''{{{#gold μΆœμƒ}}}''...

199604 rows Γ— 4 columns

def refine_triple(triple):
    """Turn one parsed table row into cleaned predicate/object triples.

    The predicate is the first ``'''bold'''`` span of ``triple['predicate']``
    with ``[br]`` removed, cut at the first ``|`` and stripped of surrounding
    ``[`` characters. Every ``[[...]]`` link found in ``triple['objects']``
    yields one triple; if the text opens a link but contains no ``]]`` at all,
    a closing ``]]`` is assumed at the end.

    Args:
        triple: dict with string values under ``'predicate'`` and
            ``'objects'``. It is not modified.

    Returns:
        ``None`` when the predicate has no bold span; otherwise a (possibly
        empty) list of ``{'predicate': ..., 'objects': ...}`` dicts.
    """
    bold_spans = re.findall(r"'''(.*?)'''", triple['predicate'])
    if not bold_spans:
        return None
    predicate = bold_spans[0].replace('[br]', '').split('|')[0].strip('[')

    # Repair an unterminated link on a local value only: writing the repaired
    # text back into `triple` would silently alter the caller's data.
    objects = triple['objects']
    if '[[' in objects and ']]' not in objects:
        objects += ']]'

    return [
        {'predicate': predicate, 'objects': target}
        for target in re.findall(r'\[\[(.*?)\]\]', objects)
    ]

def get_refine_triple(triples):
    """Refine every parsed row and gather the triples that have an object.

    Each element of ``triples`` is passed to ``refine_triple``; rows for which
    it returns ``None`` or an empty list contribute nothing, and triples whose
    ``'objects'`` value is empty are dropped.

    Returns:
        The list of kept triples, or ``None`` when no triple was kept.
    """
    kept = []
    for triple in triples:
        candidates = refine_triple(triple) or []
        kept.extend(rt for rt in candidates if rt['objects'])
    return kept or None

# Precomposed Hangul syllables occupy U+AC00 through U+D7A3. The range is
# spelled with escapes so it survives any re-encoding of this file (the
# previous literal range had been mis-encoded into an invalid character range).
_KOREAN_RUN = re.compile(r"[\uAC00-\uD7A3]+")


def find_korean_text(x):
    """Return the first run of Hangul syllables in ``x``.

    Args:
        x: string to search.

    Returns:
        The first maximal run of consecutive Hangul syllables, or ``None``
        when ``x`` contains none.
    """
    match = _KOREAN_RUN.search(x)
    return match.group(0) if match else None
# Refine every page's parsed pairs into triples. assign() builds a new frame,
# so no SettingWithCopyWarning is raised when filtered_df is a filtered slice.
filtered_df = filtered_df.assign(
    refined_triple=filtered_df['parsed_table_contents'].progress_apply(get_refine_triple)
)

# One row per (title, triple): pages without triples (None) are dropped first.
triple_df = filtered_df[['title', 'refined_triple']].dropna()
triple_df = triple_df.explode('refined_triple')
triple_df['predicate'] = triple_df['refined_triple'].apply(lambda x: x['predicate'])
triple_df['object'] = triple_df['refined_triple'].apply(lambda x: x['objects'])
# .copy() so the column overwrite below acts on an independent frame.
triple_df = triple_df[['title', 'predicate', 'object']].copy()

# Reduce each predicate to its first Hangul run; rows without one become None
# and are dropped.
triple_df['predicate'] = triple_df['predicate'].progress_apply(find_korean_text)
triple_df = triple_df.dropna()

# Keep only the predicates that occur at least 200 times.
predicate_cnt_df = triple_df.groupby('predicate')['object'].count().reset_index()
filtered_predicates = predicate_cnt_df[predicate_cnt_df['object'] >= 200]['predicate'].unique()
filtered_triple_df = triple_df[triple_df['predicate'].isin(filtered_predicates)]
filtered_triple_df
filtered_triple_object_refined = filtered_triple_df.copy()
# Keep only the part of each object before the first '|' (presumably the link
# target of a "target|label" wiki link -- confirm against the markup).
filtered_triple_object_refined['object'] = filtered_triple_object_refined['object'].apply(lambda x: x.split('|')[0].strip())

# Exclude predicates (and objects) that look error-prone.
# The Hangul literals are restored from their mis-encoded form.
exclude_predicates = ['가고시마', '가족관계', '닛포', '인간관계', '부모', '가족', '형제자매']
# Fixed: this was `excloude_objects - [...]` (NameError), and a missing comma
# fused two adjacent items into one string that could never match.
exclude_objects = ['아버지', '어머니', '누나', '형', '동생', '남동생']
filtered_triple_object_refined = filtered_triple_object_refined[~filtered_triple_object_refined['predicate'].isin(exclude_predicates)]
# Fixed: closing bracket was missing. .copy() detaches the filtered result so
# the column assignments below do not raise SettingWithCopyWarning.
filtered_triple_object_refined = filtered_triple_object_refined[~filtered_triple_object_refined['object'].isin(exclude_objects)].copy()

# Turn titles/objects into namu.wiki page URLs and prefix the predicates.
filtered_triple_object_refined['title'] = filtered_triple_object_refined['title'].apply(lambda x: f'https://namu.wiki/w/{x}')
filtered_triple_object_refined['object'] = filtered_triple_object_refined['object'].apply(lambda x: f'https://namu.wiki/w/{x}')
filtered_triple_object_refined['predicate'] = filtered_triple_object_refined['predicate'].apply(lambda x: f'predicate/{x}')
filtered_triple_object_refined.columns = ['subject', 'predicate', 'object']
filtered_triple_object_refined
# Adjust the triples to RDF syntax so they can be loaded into graphDB, and
# remove the things that cause assorted loading errors.

triple = filtered_triple_object_refined.copy()

# Wrap every term in angle brackets. Stray '<' / '>' inside a term would end
# the bracketed term early, so they are removed first (now for objects as well,
# matching subjects and predicates).
triple['subject'] = triple['subject'].apply(lambda x: f"<{x.replace('<', '').replace('>', '')}>")
triple['predicate'] = triple['predicate'].apply(lambda x: f"<https://{x.replace('<', '').replace('>', '')}>")
triple['object'] = triple['object'].apply(lambda x: f"<{x.replace('<', '').replace('>', '')}>")

# Replace every run of whitespace with a single underscore.
triple['subject'] = triple['subject'].apply(lambda x: '_'.join(x.split()))
triple['predicate'] = triple['predicate'].apply(lambda x: '_'.join(x.split()))
triple['object'] = triple['object'].apply(lambda x: '_'.join(x.split()))

# Drop rows whose subject/object contains one of: % # ^ { } ` |
# (raw-string character class; same characters as the previous alternation,
# without invalid escape sequences in a non-raw string).
triple = triple[~triple['subject'].str.contains(r'[%#^{}`|]')]
triple = triple[~triple['object'].str.contains(r'[%#^{}`|]')]
# Drop objects that point at image/file pages. regex=False matches the
# extension literally; as a regex the '.' matched any character.
triple = triple[~triple['object'].str.contains('.jpg', regex=False)]
triple = triple[~triple['object'].str.contains('.svg', regex=False)]
triple = triple[~triple['object'].str.contains('.png', regex=False)]
# "File:" namespace prefix, restored from its mis-encoded form.
# .copy() detaches the filtered frame before the column writes below.
triple = triple[~triple['object'].str.contains('파일:', regex=False)].copy()

# Remove backslashes, double quotes and square brackets from the terms.
triple['subject'] = triple['subject'].apply(lambda x: x.replace('\\', '').replace('"', '').replace(']', '').replace('[', ''))
triple['object'] = triple['object'].apply(lambda x: x.replace('\\', '').replace('"', '').replace(']', '').replace('[', ''))

# Drop specific subjects (literal restored from its mis-encoded form).
triple = triple[~triple['subject'].str.contains('Réalta_Nua', regex=False)]
triple = triple[~triple['subject'].str.contains('/Fate/', regex=False)].copy()

# Statement terminator column.
triple['end'] = '.'