Improved artist / title parsing

This commit is contained in:
Krateng 2018-11-28 15:33:30 +01:00
parent ee5bd0998b
commit 144198f933
5 changed files with 58 additions and 154 deletions

View File

@ -1,161 +1,65 @@
import re import re
def fullclean(artist,title): def fullclean(artist,title):
artists = cleanup(removespecial(artist)) artists = parseArtists(removespecial(artist))
title = cleantitle(removespecial(title)) title = parseTitle(removespecial(title))
(title,moreartists) = findartistsintitle(title) (title,moreartists) = parseTitleForArtists(title)
artists += moreartists artists += moreartists
return (artists,title) return (list(set(artists)),title)
def removespecial(s): def removespecial(s):
return s.replace("\t","").replace("","").replace("\n","") return s.replace("\t","").replace("","").replace("\n","")
def cleanup(artiststr):
if artiststr == "": delimiters_feat = ["ft.","ft","feat.","feat","featuring"] #Delimiters used for extra artists, even when in the title field
delimiters = ["vs.","vs","&"] #Delimiters in informal titles, spaces expected around them
delimiters_formal = ["; ",";"] #Delimiters used specifically to tag multiple artists when only one tag field is available, no spaces used
def parseArtists(a):
if a.strip() == "":
return [] return []
for d in delimiters_feat:
if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None:
return parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a))
for d in (delimiters + delimiters_feat):
if ((" " + d + " ") in a):
ls = []
for i in a.split(" " + d + " "):
ls += parseArtists(i)
return ls
for d in delimiters_formal:
if (d in a):
ls = []
for i in a.split(d):
ls += parseArtists(i)
return ls
return [a.strip()]
artists = [artiststr] def parseTitle(t):
t = t.replace("[","(").replace("]",")")
artistsnew = [] t = re.sub(r" \(as made famous by .*?\)","",t)
for a in artists: t = re.sub(r" \(originally by .*?\)","",t)
artistsnew.append(re.sub(r"(.*) \(ft. (.*)\)",r"\1",a))
artistsnew.append(re.sub(r"(.*) \(ft. (.*)\)",r"\2",a))
artists = artistsnew return t
artistsnew = []
for a in artists:
artistsnew.append(a.split(" vs. "))
artists = flatten(artistsnew)
artistsnew = []
for a in artists:
artistsnew.append(a.split(" vs "))
artists = flatten(artistsnew)
artistsnew = []
for a in artists:
artistsnew.append(a.split(" & "))
artists = flatten(artistsnew)
artistsnew = []
for a in artists:
artistsnew.append(a.split(" ft. "))
artists = flatten(artistsnew)
artistsnew = []
for a in artists:
artistsnew.append(a.split(" Ft. "))
artists = flatten(artistsnew)
artistsnew = []
for a in artists:
artistsnew.append(a.split(" Feat. "))
artists = flatten(artistsnew)
artistsnew = []
for a in artists:
artistsnew.append(a.split(" feat. "))
artists = flatten(artistsnew)
artistsnew = []
for a in artists:
artistsnew.append(a.split(" featuring "))
artists = flatten(artistsnew)
artistsnew = []
for a in artists:
artistsnew.append(a.split(" Featuring "))
artists = flatten(artistsnew)
artistsnew = []
for a in artists:
artistsnew.append(a.split(" ; "))
artists = flatten(artistsnew)
artistsnew = []
for a in artists:
artistsnew.append(a.split("; "))
artists = flatten(artistsnew)
artistsnew = []
for a in artists:
artistsnew.append(a.split(";"))
artists = flatten(artistsnew)
artistsnew = []
#if not artists[0] == artiststr:
# print(artiststr + " became " + str(artists))
return artists
def cleantitle(title):
title = title.replace("[","(").replace("]",")")
title = re.sub(r" \(as made famous by .*?\)","",title)
title = re.sub(r" \(originally by .*?\)","",title)
return title
def findartistsintitle(title): def parseTitleForArtists(t):
for d in delimiters_feat:
if re.match(r"(.*) \(" + d + " (.*?)\)",t) is not None:
(title,artists) = parseTitleForArtists(re.sub(r"(.*) \(" + d + " (.*?)\)",r"\1",t))
artists += parseArtists(re.sub(r"(.*) \(" + d + " (.*?)\).*",r"\2",t))
return (title,artists)
truetitle = title return (t,[])
artists = ""
newtitle = re.sub(r"(.*) \(ft. (.*?)\)",r"\1",title)
if (title != newtitle):
artists = re.sub(r"(.*) \(ft. (.*?)\).*",r"\2",title)
truetitle = newtitle
newtitle = re.sub(r"(.*) \(feat. (.*?)\)",r"\1",title)
if (title != newtitle):
artists = re.sub(r"(.*) \(feat. (.*?)\).*",r"\2",title)
truetitle = newtitle
newtitle = re.sub(r"(.*) \(Feat. (.*?)\)",r"\1",title)
if (title != newtitle):
artists = re.sub(r"(.*) \(Feat. (.*?)\).*",r"\2",title)
truetitle = newtitle
newtitle = re.sub(r"(.*) \(Ft. (.*?)\)",r"\1",title)
if (title != newtitle):
artists = re.sub(r"(.*) \(Ft. (.*?)\).*",r"\2",title)
truetitle = newtitle
newtitle = re.sub(r"(.*) \(Featuring (.*?)\)",r"\1",title)
if (title != newtitle):
artists = re.sub(r"(.*) \(Featuring (.*?)\).*",r"\2",title)
truetitle = newtitle
newtitle = re.sub(r"(.*) \(featuring (.*?)\)",r"\1",title)
if (title != newtitle):
artists = re.sub(r"(.*) \(featuring (.*?)\).*",r"\2",title)
truetitle = newtitle
artistlist = cleanup(artists)
return (truetitle,artistlist)
def flatten(lis): def flatten(lis):

View File

@ -118,8 +118,10 @@ def post_scrobble():
#title = urllib.parse.unquote(keys.get("title")) #title = urllib.parse.unquote(keys.get("title"))
artists = keys.get("artist") artists = keys.get("artist")
title = keys.get("title") title = keys.get("title")
time = int(keys.get("time"))
(artists,title) = cleanup.fullclean(artists,title) (artists,title) = cleanup.fullclean(artists,title)
time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp()) if time is None:
time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
## this is necessary for localhost testing ## this is necessary for localhost testing
response.set_header("Access-Control-Allow-Origin","*") response.set_header("Access-Control-Allow-Origin","*")
@ -134,7 +136,6 @@ def post_scrobble():
@route("/sync") @route("/sync")
def abouttoshutdown(): def abouttoshutdown():
sync() sync()
print("Database saved to disk.")
#sys.exit() #sys.exit()
# Starts the server # Starts the server
@ -298,6 +299,7 @@ def sync():
global lastsync global lastsync
lastsync = time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp()) lastsync = time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
print("Database saved to disk.")
# Queries the database # Queries the database

View File

@ -13,10 +13,8 @@ for l in log:
title = data[2] title = data[2]
time = data[3] time = data[3]
title = cleanup.cleantitle(title)
artists = cleanup.cleanup(artist) (artists,title) = cleanup.fullclean(artist,title)
(title,extraartists) = cleanup.findartistsintitle(title)
artists = list(set(artists + extraartists))
artistsstr = "".join(artists) artistsstr = "".join(artists)

4
rules/.gitignore vendored
View File

@ -1,2 +1,2 @@
*.csv *.tsv
!examplerules.csv !examplerules.tsv

View File

@ -7,8 +7,8 @@
### countas: defines an artist that should be counted together with another artist for chart statistics etc. This will not change the separation in the database and all effects of this rule will disappear as soon as it is no longer active. Second column is the artist, third column the replacement artist ### countas: defines an artist that should be counted together with another artist for chart statistics etc. This will not change the separation in the database and all effects of this rule will disappear as soon as it is no longer active. Second column is the artist, third column the replacement artist
### ###
### THE RULES IN THIS EXAMPLE FILE ARE IGNORED ### THE RULES IN THIS EXAMPLE FILE ARE IGNORED
notanartist,In Dreams, notanartist In Dreams
belongtogether,Darth & Vader, belongtogether Darth & Vader
replacetitle,첫 사랑니 (Rum Pum Pum Pum),Rum Pum Pum Pum replacetitle 첫 사랑니 (Rum Pum Pum Pum) Rum Pum Pum Pum
replaceartist,Dal Shabet,Dal★Shabet replaceartist Dal Shabet Dal★Shabet
countas,Trouble Maker,HyunA countas Trouble Maker HyunA
Can't render this file because it contains an unexpected character in line 3 and column 58.