Contents
import pandas as pd
import re
from tqdm import tqdm
import pickle
import copy
tqdm.pandas()
# Dataset source: https://huggingface.co/datasets/heegyu/namuwiki/tree/main
namuwiki_parquet_path = 'data/namuwiki_20210301.parquet'
df = pd.read_parquet(namuwiki_parquet_path)
# Print the first rows to inspect the columns that were loaded.
print(df.head())
title text \
0 ! #redirect λλν\n
1 !!μμ!! \n[λͺ©μ°¨]\n\n'''{{{+1 οΌοΌγγγ£γ¨οΌοΌ}}}'''\n\n== κ°μ ==\...
2 ββ¦β¦β ||<-2><tablebordercolor=#878787><tablealign=ri...
3 # [[λΆλ₯:νΉμ λ¬Έμ]]\n[include(ν:λ€λ₯Έ λ»1, other1=μμ
μμ μ¬μ©...
4 #FairyJoke [include(ν:λ§ν¬μ μ£Όμ, λ§ν¬=[[\\#FairyJoke]] λλ [[#F...
contributors namespace
0 r:hoon12560,namubot
1 110.46.34.123,kirby10,max0243,218.54.117.149,r...
2 milkbutter,116.122.29.64,211.54.69.240,180.68....
3 topenkun,125.186.248.118,benquan1812,180.231.1...
4 r:db,lotot,swontrdg,58.224.144.55,guylian,39.1...
# Only the page title and its raw wiki markup are used from here on.
kept_columns = ['title', 'text']
df = df[kept_columns]
df
title | text | |
---|---|---|
0 | ! | #redirect λλν\n |
1 | !!μμ!! | \n[λͺ©μ°¨]\n\n'''{{{+1 οΌοΌγγγ£γ¨οΌοΌ}}}'''\n\n== κ°μ ==\... |
2 | ββ¦β¦β | ||<-2><tablebordercolor=#878787><tablealign=ri... |
3 | # | [[λΆλ₯:νΉμ λ¬Έμ]]\n[include(ν:λ€λ₯Έ λ»1, other1=μμ μμ μ¬μ©... |
4 | #FairyJoke | [include(ν:λ§ν¬μ μ£Όμ, λ§ν¬=[[\\#FairyJoke]] λλ [[#F... |
... | ... | ... |
867019 | κ΄μ μ° νΈλ¦¬ | #redirect κ΄μ μ° μν°\n |
867020 | μμ€μλ | #redirect μ΄μ€μ\n |
867021 | μΉλ€μ€ ν©νμ΄ | #redirect μΉλ€μ€ FC\n |
867022 | μνμΉμ£Ό | [[λΆλ₯:μ‘°μ μ νκΆ]]\n[include(ν:μ‘°μ μ νκΆ(νμ‘° ~ μ°μ°κ΅°))]\n... |
867023 | νλ μ μ | #redirect νλ μΉμ° λ£½λ¨Ό\n |
867024 rows Γ 2 columns
# Keep only the pages whose markup contains a table attribute ('<table...').
has_table_markup = df['text'].str.contains('<table')
filtered_df = df[has_table_markup]
filtered_df
title | text | |
---|---|---|
2 | ββ¦β¦β | ||<-2><tablebordercolor=#878787><tablealign=ri... |
6 | #Fairy_dancing_in_lake | [include(ν:λ§ν¬μ μ£Όμ, λ§ν¬=[[\\#Fairy_dancing_in_la... |
14 | &(μ±κΈ) | [include(ν:νλ§μ¬ν€ μμ λ―Έμ μ±κΈ)]\n||||||<tablewidth=1... |
18 | νν κΈ°μ°μ€ λ‘λ§λ€μ½©ν° | [include(ν:λ€λ₯Έ λ»1, other1=νμ±, rd1=λ² ν κ²μ°μ€)]\n[inc... |
19 | μ¬ν λΌ(Re: μ λ‘λΆν° μμνλ μ΄μΈκ³ μν) | [include(ν:ν λ‘ ν©μ, ν λ‘ μ£Όμ1=ANullAndFascinatedShoe... |
... | ... | ... |
867012 | λ§λ₯Έ λ¨μ | ||<table width=300><table bordercolor=#000><ta... |
867014 | λλ(λ‘μ ν¬λΌμ΄) | [[λΆλ₯:λ‘μ ν¬λΌμ΄]]\n[include(ν:μμ λ¬Έμ, top1=λ‘μ ν¬λΌμ΄)]... |
867015 | νμ μΉμ£Ό | [[λΆλ₯:μ‘°μ μ νκΆ]]\n[include(ν:μ‘°μ μ νκΆ(νμ‘° ~ μ°μ°κ΅°))]\n... |
867016 | μμ¦μ¬ μμ΄μμ | ||<table width=100%><width=58%><tablebordercol... |
867022 | μνμΉμ£Ό | [[λΆλ₯:μ‘°μ μ νκΆ]]\n[include(ν:μ‘°μ μ νκΆ(νμ‘° ~ μ°μ°κ΅°))]\n... |
283385 rows Γ 2 columns
# Pages that include the table-of-contents marker tend to have a reasonable
# amount of content, so that marker is used as the filtering criterion.
# regex=False: the marker is a literal string. Interpreted as a regular
# expression, the square brackets form a character class, so any page that
# merely contains one of the enclosed characters would pass the filter.
# .copy(): make the result an independent frame so that later column
# assignments do not write to a slice of `df`.
filtered_df = filtered_df[
    filtered_df['text'].str.contains('[λͺ©μ°¨]', regex=False)
].copy()
filtered_df
title | text | |
---|---|---|
2 | ββ¦β¦β | ||<-2><tablebordercolor=#878787><tablealign=ri... |
6 | #Fairy_dancing_in_lake | [include(ν:λ§ν¬μ μ£Όμ, λ§ν¬=[[\\#Fairy_dancing_in_la... |
14 | &(μ±κΈ) | [include(ν:νλ§μ¬ν€ μμ λ―Έμ μ±κΈ)]\n||||||<tablewidth=1... |
18 | νν κΈ°μ°μ€ λ‘λ§λ€μ½©ν° | [include(ν:λ€λ₯Έ λ»1, other1=νμ±, rd1=λ² ν κ²μ°μ€)]\n[inc... |
19 | μ¬ν λΌ(Re: μ λ‘λΆν° μμνλ μ΄μΈκ³ μν) | [include(ν:ν λ‘ ν©μ, ν λ‘ μ£Όμ1=ANullAndFascinatedShoe... |
... | ... | ... |
867012 | λ§λ₯Έ λ¨μ | ||<table width=300><table bordercolor=#000><ta... |
867014 | λλ(λ‘μ ν¬λΌμ΄) | [[λΆλ₯:λ‘μ ν¬λΌμ΄]]\n[include(ν:μμ λ¬Έμ, top1=λ‘μ ν¬λΌμ΄)]... |
867015 | νμ μΉμ£Ό | [[λΆλ₯:μ‘°μ μ νκΆ]]\n[include(ν:μ‘°μ μ νκΆ(νμ‘° ~ μ°μ°κ΅°))]\n... |
867016 | μμ¦μ¬ μμ΄μμ | ||<table width=100%><width=58%><tablebordercol... |
867022 | μνμΉμ£Ό | [[λΆλ₯:μ‘°μ μ νκΆ]]\n[include(ν:μ‘°μ μ νκΆ(νμ‘° ~ μ°μ°κ΅°))]\n... |
274635 rows Γ 2 columns
# In most cases the table that appears first on a page is its info table,
# so only the text that precedes the table-of-contents marker is kept.
# `assign` builds a new frame instead of writing a column into a slice,
# which is what raised SettingWithCopyWarning.
filtered_df = filtered_df.assign(
    text=filtered_df['text'].apply(lambda x: x.split('[λͺ©μ°¨]')[0])
)
filtered_df
<ipython-input-12-8d5787fbdacd>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
filtered_df['text'] = filtered_df['text'].apply(lambda x: x.split('[λͺ©μ°¨]')[0])
title | text | |
---|---|---|
2 | ββ¦β¦β | ||<-2><tablebordercolor=#878787><tablealign=ri... |
6 | #Fairy_dancing_in_lake | [include(ν:λ§ν¬μ μ£Όμ, λ§ν¬=[[\\#Fairy_dancing_in_la... |
14 | &(μ±κΈ) | [include(ν:νλ§μ¬ν€ μμ λ―Έμ μ±κΈ)]\n||||||<tablewidth=1... |
18 | νν κΈ°μ°μ€ λ‘λ§λ€μ½©ν° | [include(ν:λ€λ₯Έ λ»1, other1=νμ±, rd1=λ² ν κ²μ°μ€)]\n[inc... |
19 | μ¬ν λΌ(Re: μ λ‘λΆν° μμνλ μ΄μΈκ³ μν) | [include(ν:ν λ‘ ν©μ, ν λ‘ μ£Όμ1=ANullAndFascinatedShoe... |
... | ... | ... |
867012 | λ§λ₯Έ λ¨μ | ||<table width=300><table bordercolor=#000><ta... |
867014 | λλ(λ‘μ ν¬λΌμ΄) | [[λΆλ₯:λ‘μ ν¬λΌμ΄]]\n[include(ν:μμ λ¬Έμ, top1=λ‘μ ν¬λΌμ΄)]... |
867015 | νμ μΉμ£Ό | [[λΆλ₯:μ‘°μ μ νκΆ]]\n[include(ν:μ‘°μ μ νκΆ(νμ‘° ~ μ°μ°κ΅°))]\n... |
867016 | μμ¦μ¬ μμ΄μμ | ||<table width=100%><width=58%><tablebordercol... |
867022 | μνμΉμ£Ό | [[λΆλ₯:μ‘°μ μ νκΆ]]\n[include(ν:μ‘°μ μ νκΆ(νμ‘° ~ μ°μ°κ΅°))]\n... |
274635 rows Γ 2 columns
def get_table_contents(text):
    """Collect the table-row lines found in ``text``.

    A row is a stretch that starts at the first ``||`` on a line and runs to
    the last ``||`` on that line, provided that ``||`` is immediately
    followed by a newline. The trailing newline is part of each match.

    Args:
        text: Raw wiki markup to scan.

    Returns:
        A list with the matched rows in order of appearance; empty when the
        text has no such row.
    """
    row_pattern = r"\|\|.*\|\|\n"
    return list(re.findall(row_pattern, text))
# Extract the raw table rows of every page (with a tqdm progress bar).
# `assign` returns a new frame, so no column is written into a slice of
# another DataFrame (the cause of the SettingWithCopyWarning).
filtered_df = filtered_df.assign(
    table_contents=filtered_df['text'].progress_apply(get_table_contents)
)
filtered_df
100%|ββββββββββ| 274635/274635 [00:03<00:00, 86791.68it/s]
<ipython-input-223-c12b80ec17d1>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
filtered_df['table_contents'] = filtered_df['text'].progress_apply(lambda x: get_table_contents(x))
title | text | table_contents | |
---|---|---|---|
2 | ββ¦β¦β | ||<-2><tablebordercolor=#878787><tablealign=ri... | [||<-2><tablebordercolor=#878787><tablealign=r... |
6 | #Fairy_dancing_in_lake | [include(ν:λ§ν¬μ μ£Όμ, λ§ν¬=[[\\#Fairy_dancing_in_la... | [] |
14 | &(μ±κΈ) | [include(ν:νλ§μ¬ν€ μμ λ―Έμ μ±κΈ)]\n||||||<tablewidth=1... | [||||||<tablewidth=100%> '''[[νλ§μ¬ν€ μμ λ―Έ]]μ μ±κΈ''... |
18 | νν κΈ°μ°μ€ λ‘λ§λ€μ½©ν° | [include(ν:λ€λ₯Έ λ»1, other1=νμ±, rd1=λ² ν κ²μ°μ€)]\n[inc... | [||<tablealign=center><bgcolor=#ffffff> [[νμΌ:ν... |
19 | μ¬ν λΌ(Re: μ λ‘λΆν° μμνλ μ΄μΈκ³ μν) | [include(ν:ν λ‘ ν©μ, ν λ‘ μ£Όμ1=ANullAndFascinatedShoe... | [||<-2><bgcolor=#414a4c> {{{#white μ λλ©μ΄μ }}}[* ... |
... | ... | ... | ... |
867012 | λ§λ₯Έ λ¨μ | ||<table width=300><table bordercolor=#000><ta... | [||<table width=300><table bordercolor=#000><t... |
867014 | λλ(λ‘μ ν¬λΌμ΄) | [[λΆλ₯:λ‘μ ν¬λΌμ΄]]\n[include(ν:μμ λ¬Έμ, top1=λ‘μ ν¬λΌμ΄)]... | [||<tablealign=center><tablewidth=70%><-2><bgc... |
867015 | νμ μΉμ£Ό | [[λΆλ₯:μ‘°μ μ νκΆ]]\n[include(ν:μ‘°μ μ νκΆ(νμ‘° ~ μ°μ°κ΅°))]\n... | [||<-2><tablealign=right><tablewidth=400><tabl... |
867016 | μμ¦μ¬ μμ΄μμ | ||<table width=100%><width=58%><tablebordercol... | [||<table width=100%><width=58%><tableborderco... |
867022 | μνμΉμ£Ό | [[λΆλ₯:μ‘°μ μ νκΆ]]\n[include(ν:μ‘°μ μ νκΆ(νμ‘° ~ μ°μ°κ΅°))]\n... | [||<-2><tablealign=right><tablewidth=400><tabl... |
274635 rows Γ 3 columns
def parse_table_contents(table_contents):
    """Turn raw table rows into predicate/object pairs.

    Each row is trimmed of surrounding whitespace and pipes, then split on
    ``||``. Rows that carry a ``<table`` attribute or that have a single cell
    are skipped. For rows with more than two cells only the last two are
    used.

    Args:
        table_contents: Iterable of raw row strings.

    Returns:
        A list of ``{'predicate': ..., 'objects': ...}`` dicts, one per
        accepted row, in input order.
    """
    pairs = []
    for raw_row in table_contents:
        # str.strip('||') treats its argument as a set of characters, so
        # every leading and trailing '|' is removed, not just one pair.
        row = raw_row.strip().strip('||')
        cells = row.split('||')
        if '<table' in row or len(cells) == 1:
            continue
        # At least two cells remain here; the last two form the pair.
        predicate, objects = cells[-2:]
        pairs.append({'predicate': predicate, 'objects': objects})
    return pairs
# Parse every page's raw rows into predicate/object pairs (with a tqdm
# progress bar). `assign` returns a new frame, which avoids writing a column
# into a slice of another DataFrame (the cause of SettingWithCopyWarning).
filtered_df = filtered_df.assign(
    parsed_table_contents=filtered_df['table_contents'].progress_apply(
        parse_table_contents
    )
)
filtered_df
100%|ββββββββββ| 274635/274635 [00:27<00:00, 9872.27it/s]
<ipython-input-230-a5ab3ba10b00>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
filtered_df['parsed_table_contents'] = filtered_df['table_contents'].progress_apply(lambda x: parse_table_contents(x))
title | text | table_contents | parsed_table_contents | |
---|---|---|---|---|
2 | ββ¦β¦β | ||<-2><tablebordercolor=#878787><tablealign=ri... | [||<-2><tablebordercolor=#878787><tablealign=r... | [{'predicate': ' '''μ½λλͺ ''' ', 'objects': ' ββ¦β¦... |
6 | #Fairy_dancing_in_lake | [include(ν:λ§ν¬μ μ£Όμ, λ§ν¬=[[\\#Fairy_dancing_in_la... | [] | [] |
14 | &(μ±κΈ) | [include(ν:νλ§μ¬ν€ μμ λ―Έμ μ±κΈ)]\n||||||<tablewidth=1... | [||||||<tablewidth=100%> '''[[νλ§μ¬ν€ μμ λ―Έ]]μ μ±κΈ''... | [{'predicate': '<width=40%> '''&''' (2003) ', ... |
18 | νν κΈ°μ°μ€ λ‘λ§λ€μ½©ν° | [include(ν:λ€λ₯Έ λ»1, other1=νμ±, rd1=λ² ν κ²μ°μ€)]\n[inc... | [||<tablealign=center><bgcolor=#ffffff> [[νμΌ:ν... | [{'predicate': '<bgcolor=#007f66> {{{#ffffff '... |
19 | μ¬ν λΌ(Re: μ λ‘λΆν° μμνλ μ΄μΈκ³ μν) | [include(ν:ν λ‘ ν©μ, ν λ‘ μ£Όμ1=ANullAndFascinatedShoe... | [||<-2><bgcolor=#414a4c> {{{#white μ λλ©μ΄μ }}}[* ... | [] |
... | ... | ... | ... | ... |
867012 | λ§λ₯Έ λ¨μ | ||<table width=300><table bordercolor=#000><ta... | [||<table width=300><table bordercolor=#000><t... | [] |
867014 | λλ(λ‘μ ν¬λΌμ΄) | [[λΆλ₯:λ‘μ ν¬λΌμ΄]]\n[include(ν:μμ λ¬Έμ, top1=λ‘μ ν¬λΌμ΄)]... | [||<tablealign=center><tablewidth=70%><-2><bgc... | [{'predicate': ' {{{#000,#fff '''ν΄λμ€'''}}} ', ... |
867015 | νμ μΉμ£Ό | [[λΆλ₯:μ‘°μ μ νκΆ]]\n[include(ν:μ‘°μ μ νκΆ(νμ‘° ~ μ°μ°κ΅°))]\n... | [||<-2><tablealign=right><tablewidth=400><tabl... | [{'predicate': '<width=50> '''{{{#gold λ³Έλͺ }}}''... |
867016 | μμ¦μ¬ μμ΄μμ | ||<table width=100%><width=58%><tablebordercol... | [||<table width=100%><width=58%><tableborderco... | [{'predicate': '<rowbgcolor=#ffff99> [[μ μνλλ μ¬... |
867022 | μνμΉμ£Ό | [[λΆλ₯:μ‘°μ μ νκΆ]]\n[include(ν:μ‘°μ μ νκΆ(νμ‘° ~ μ°μ°κ΅°))]\n... | [||<-2><tablealign=right><tablewidth=400><tabl... | [{'predicate': '<width=50> '''{{{#gold μΆμ}}}''... |
274635 rows Γ 4 columns
# Count the parsed rows of each page; the count is what decides which pages
# are kept (only pages with a non-zero count are of interest).
# The frame is built with `assign` so that the count column is not written
# into a column-slice of `filtered_df` (the cause of SettingWithCopyWarning).
cnt_df = filtered_df[['title']].assign(
    parsed_table_contents=filtered_df['parsed_table_contents'].apply(len)
)
cnt_df
<ipython-input-232-f18afb6f8856>:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
cnt_df['parsed_table_contents'] = cnt_df['parsed_table_contents'].apply(lambda x: len(x))
title | parsed_table_contents | |
---|---|---|
2 | ββ¦β¦β | 5 |
6 | #Fairy_dancing_in_lake | 0 |
14 | &(μ±κΈ) | 1 |
18 | νν κΈ°μ°μ€ λ‘λ§λ€μ½©ν° | 1 |
19 | μ¬ν λΌ(Re: μ λ‘λΆν° μμνλ μ΄μΈκ³ μν) | 0 |
... | ... | ... |
867012 | λ§λ₯Έ λ¨μ | 0 |
867014 | λλ(λ‘μ ν¬λΌμ΄) | 9 |
867015 | νμ μΉμ£Ό | 5 |
867016 | μμ¦μ¬ μμ΄μμ | 7 |
867022 | μνμΉμ£Ό | 5 |
274635 rows Γ 2 columns
# Titles that have at least one parsed row; only pages with such a title
# are kept.
has_parsed_rows = cnt_df['parsed_table_contents'] != 0
titles_with_rows = cnt_df[has_parsed_rows]['title'].unique()
filtered_df = filtered_df[filtered_df['title'].isin(titles_with_rows)]
filtered_df
title | text | table_contents | parsed_table_contents | |
---|---|---|---|---|
2 | ββ¦β¦β | ||<-2><tablebordercolor=#878787><tablealign=ri... | [||<-2><tablebordercolor=#878787><tablealign=r... | [{'predicate': ' '''μ½λλͺ ''' ', 'objects': ' ββ¦β¦... |
14 | &(μ±κΈ) | [include(ν:νλ§μ¬ν€ μμ λ―Έμ μ±κΈ)]\n||||||<tablewidth=1... | [||||||<tablewidth=100%> '''[[νλ§μ¬ν€ μμ λ―Έ]]μ μ±κΈ''... | [{'predicate': '<width=40%> '''&''' (2003) ', ... |
18 | νν κΈ°μ°μ€ λ‘λ§λ€μ½©ν° | [include(ν:λ€λ₯Έ λ»1, other1=νμ±, rd1=λ² ν κ²μ°μ€)]\n[inc... | [||<tablealign=center><bgcolor=#ffffff> [[νμΌ:ν... | [{'predicate': '<bgcolor=#007f66> {{{#ffffff '... |
21 | λ κ΅΄λ£¨μ€ μ½λ₯΄λμμ€ | [include(ν:λμ£μ£Όκ΅(Re: μ λ‘λΆν° μμνλ μ΄μΈκ³ μν))]\n\n||<... | [||<bgcolor=#c0c0c0> {{{#ffffff '''λΌμ΄νΈ λ Έλ²¨ μ€μ ''... | [{'predicate': '<bgcolor=#c0c0c0> {{{#ffffff '... |
22 | μν€λλ(Re: μ λ‘λΆν° μμνλ μ΄μΈκ³ μν) | [include(ν:νμμμ )]\n[include(ν:λ§λ (Re: μ λ‘λΆν° μμνλ... | [||<width=57%><bgcolor=#ffffff> [[νμΌ:μν€λλ λΌλ Έλ²¨ ... | [{'predicate': '<width=57%><bgcolor=#ffffff> [... |
... | ... | ... | ... | ... |
867005 | λΆλλ°°λ€(리ν λμ΄νΈλ©μ΄ μ리μ¦) | ||<-2><table width=300><table bordercolor=#000... | [||<-2><table width=300><table bordercolor=#00... | [{'predicate': '<width=50%> [[νμΌ:Male_Bully.pn... |
867014 | λλ(λ‘μ ν¬λΌμ΄) | [[λΆλ₯:λ‘μ ν¬λΌμ΄]]\n[include(ν:μμ λ¬Έμ, top1=λ‘μ ν¬λΌμ΄)]... | [||<tablealign=center><tablewidth=70%><-2><bgc... | [{'predicate': ' {{{#000,#fff '''ν΄λμ€'''}}} ', ... |
867015 | νμ μΉμ£Ό | [[λΆλ₯:μ‘°μ μ νκΆ]]\n[include(ν:μ‘°μ μ νκΆ(νμ‘° ~ μ°μ°κ΅°))]\n... | [||<-2><tablealign=right><tablewidth=400><tabl... | [{'predicate': '<width=50> '''{{{#gold λ³Έλͺ }}}''... |
867016 | μμ¦μ¬ μμ΄μμ | ||<table width=100%><width=58%><tablebordercol... | [||<table width=100%><width=58%><tableborderco... | [{'predicate': '<rowbgcolor=#ffff99> [[μ μνλλ μ¬... |
867022 | μνμΉμ£Ό | [[λΆλ₯:μ‘°μ μ νκΆ]]\n[include(ν:μ‘°μ μ νκΆ(νμ‘° ~ μ°μ°κ΅°))]\n... | [||<-2><tablealign=right><tablewidth=400><tabl... | [{'predicate': '<width=50> '''{{{#gold μΆμ}}}''... |
199604 rows Γ 4 columns
def refine_triple(triple):
    """Expand one parsed row into cleaned predicate/object pairs.

    The predicate is the first span wrapped in triple quotes (``'''...'''``)
    of ``triple['predicate']``, with ``[br]`` removed, cut at the first ``|``
    and stripped of surrounding ``[`` characters. The objects are the targets
    of every ``[[...]]`` link in ``triple['objects']``; a link that is opened
    but never closed is closed before matching.

    Args:
        triple: Dict with string values under ``'predicate'`` and
            ``'objects'``. It is not modified.

    Returns:
        ``None`` when the predicate has no triple-quoted span; otherwise a
        list of ``{'predicate': ..., 'objects': ...}`` dicts, one per link
        (possibly empty).
    """
    bold_spans = re.findall(r"'''(.*?)'''", triple['predicate'])
    if not bold_spans:
        return None

    predicate = bold_spans[0].replace('[br]', '')
    # strip('[[') removes '[' characters only; a trailing ']]' is left as is.
    predicate = predicate.split('|')[0].strip('[[')

    # Work on a local string so the caller's dict is left untouched.
    objects = triple['objects']
    if '[[' in objects and ']]' not in objects:
        objects = objects + ']]'

    return [
        {'predicate': predicate, 'objects': target}
        for target in re.findall(r'\[\[(.*?)\]\]', objects)
    ]
def get_refine_triple(triples):
    """Refine every parsed row of a page and flatten the results.

    Each row is passed through ``refine_triple``; rows for which it returns
    nothing are skipped, as are pairs whose ``'objects'`` value is empty.

    Args:
        triples: Iterable of parsed-row dicts.

    Returns:
        A list of refined pairs, or ``None`` when no pair survives.
    """
    collected = [
        pair
        for row in triples
        for pair in (refine_triple(row) or [])
        if pair['objects']
    ]
    return collected or None
# Precomposed Hangul syllables, U+AC00 through U+D7A3. The range is written
# with escapes so that it does not depend on how this file is encoded.
_HANGUL_RUN = re.compile(r"[\uAC00-\uD7A3]+")


def find_korean_text(x):
    """Return the first run of Hangul syllables in ``x``.

    Args:
        x: String to search.

    Returns:
        The first maximal run of consecutive Hangul syllables, or ``None``
        when ``x`` contains none.
    """
    match = _HANGUL_RUN.search(x)
    return match.group(0) if match else None
# Refine the parsed rows of every page; pages that yield no pair get None.
# `assign` is used throughout instead of writing columns into frames that
# were produced by filtering or column selection, which triggers
# SettingWithCopyWarning.
filtered_df = filtered_df.assign(
    refined_triple=filtered_df['parsed_table_contents'].progress_apply(
        get_refine_triple
    )
)

# One row per (title, pair): drop pages without pairs, then explode the lists.
triple_df = (
    filtered_df[['title', 'refined_triple']]
    .dropna()
    .explode('refined_triple')
)
triple_df = triple_df.assign(
    predicate=triple_df['refined_triple'].apply(lambda x: x['predicate']),
    object=triple_df['refined_triple'].apply(lambda x: x['objects']),
)[['title', 'predicate', 'object']]

# Reduce each predicate to its first Korean run; rows without one become
# None and are dropped.
triple_df = triple_df.assign(
    predicate=triple_df['predicate'].progress_apply(find_korean_text)
)
triple_df = triple_df.dropna()

# Keep only predicates that occur at least 200 times.
predicate_cnt_df = triple_df.groupby('predicate')['object'].count().reset_index()
filtered_predicates = predicate_cnt_df[predicate_cnt_df['object'] >= 200]['predicate'].unique()
filtered_triple_df = triple_df[triple_df['predicate'].isin(filtered_predicates)]
filtered_triple_df
filtered_triple_object_refined = copy.deepcopy(filtered_triple_df)
# A link target may carry a display alias after '|'; keep the target only.
filtered_triple_object_refined['object'] = filtered_triple_object_refined['object'].apply(
    lambda x: x.split('|')[0].strip()
)

# Exclude predicates that appear to produce many erroneous triples.
exclude_predicates = ['κ°κ³ μλ§', 'κ°μ‘±κ΄κ³', 'λν¬', 'μΈκ°κ΄κ³', 'λΆλͺ¨', 'κ°μ‘±', 'νμ μλ§€']
# Objects to exclude. Every entry is separated by a comma: two adjacent
# string literals without one are silently concatenated into a single entry.
exclude_objects = ['μλ²μ§', 'μ΄λ¨Έλ', 'λλ', 'ν', 'λμ', 'λμ', 'λ¨λμ']

keep_rows = (
    ~filtered_triple_object_refined['predicate'].isin(exclude_predicates)
    & ~filtered_triple_object_refined['object'].isin(exclude_objects)
)
# .copy() so the column assignments below act on an independent frame.
filtered_triple_object_refined = filtered_triple_object_refined[keep_rows].copy()

# Turn titles and objects into page URLs and prefix the predicates.
filtered_triple_object_refined['title'] = filtered_triple_object_refined['title'].apply(
    lambda x: f'https://namu.wiki/w/{x}'
)
filtered_triple_object_refined['object'] = filtered_triple_object_refined['object'].apply(
    lambda x: f'https://namu.wiki/w/{x}'
)
filtered_triple_object_refined['predicate'] = filtered_triple_object_refined['predicate'].apply(
    lambda x: f'predicate/{x}'
)
filtered_triple_object_refined.columns = ['subject', 'predicate', 'object']
filtered_triple_object_refined
# Adjust the triples to RDF syntax so they can be loaded into a graph
# database, and remove the values that cause errors on import.
triple = copy.deepcopy(filtered_triple_object_refined)

# Wrap every term in angle brackets. Any '<' or '>' inside a value is removed
# first (for the object as well, consistent with subject and predicate),
# because those characters would terminate the bracketed term early.
triple['subject'] = triple['subject'].apply(lambda x: f"<{x.replace('<', '').replace('>', '')}>")
triple['predicate'] = triple['predicate'].apply(lambda x: f"<https://{x.replace('<', '').replace('>', '')}>")
triple['object'] = triple['object'].apply(lambda x: f"<{x.replace('<', '').replace('>', '')}>")

# Replace every run of whitespace with a single underscore.
for column in ('subject', 'predicate', 'object'):
    triple[column] = triple[column].apply(lambda x: '_'.join(x.split()))

# Drop rows whose subject or object contains one of these characters.
# Raw string: the backslashes belong to the regular expression.
forbidden_chars = r'%|#|\^|\{|\}|`|\|'
keep_rows = (
    ~triple['subject'].str.contains(forbidden_chars)
    & ~triple['object'].str.contains(forbidden_chars)
)
# Drop objects that point at files. regex=False matches the markers
# literally; as a regular expression the '.' would match any character.
for marker in ('.jpg', '.svg', '.png', 'νμΌ:'):
    keep_rows &= ~triple['object'].str.contains(marker, regex=False)
# .copy() so the column assignments below act on an independent frame.
triple = triple[keep_rows].copy()

# Remove backslashes, double quotes and square brackets.
for column in ('subject', 'object'):
    triple[column] = triple[column].apply(
        lambda x: x.replace('\\', '').replace('"', '').replace(']', '').replace('[', '')
    )

# Drop subjects that match these two specific patterns.
keep_rows = (
    ~triple['subject'].str.contains('RΓ©alta_Nua')
    & ~triple['subject'].str.contains('/Fate/')
)
triple = triple[keep_rows].copy()

# Statement terminator column.
triple['end'] = '.'