mirror of
https://github.com/krateng/maloja.git
synced 2025-07-09 03:04:07 -04:00
Improved artist / title parsing
This commit is contained in:
parent
ee5bd0998b
commit
144198f933
176
cleanup.py
176
cleanup.py
@ -1,161 +1,65 @@
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
def fullclean(artist,title):
|
def fullclean(artist,title):
|
||||||
artists = cleanup(removespecial(artist))
|
artists = parseArtists(removespecial(artist))
|
||||||
title = cleantitle(removespecial(title))
|
title = parseTitle(removespecial(title))
|
||||||
(title,moreartists) = findartistsintitle(title)
|
(title,moreartists) = parseTitleForArtists(title)
|
||||||
artists += moreartists
|
artists += moreartists
|
||||||
|
|
||||||
return (artists,title)
|
return (list(set(artists)),title)
|
||||||
|
|
||||||
def removespecial(s):
|
def removespecial(s):
|
||||||
return s.replace("\t","").replace("␟","").replace("\n","")
|
return s.replace("\t","").replace("␟","").replace("\n","")
|
||||||
|
|
||||||
def cleanup(artiststr):
|
|
||||||
|
|
||||||
if artiststr == "":
|
delimiters_feat = ["ft.","ft","feat.","feat","featuring"] #Delimiters used for extra artists, even when in the title field
|
||||||
|
delimiters = ["vs.","vs","&"] #Delimiters in informal titles, spaces expected around them
|
||||||
|
delimiters_formal = ["; ",";"] #Delimiters used specifically to tag multiple artists when only one tag field is available, no spaces used
|
||||||
|
|
||||||
|
|
||||||
|
def parseArtists(a):
|
||||||
|
|
||||||
|
if a.strip() == "":
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
for d in delimiters_feat:
|
||||||
|
if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None:
|
||||||
|
return parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a))
|
||||||
|
|
||||||
artists = [artiststr]
|
for d in (delimiters + delimiters_feat):
|
||||||
|
if ((" " + d + " ") in a):
|
||||||
|
ls = []
|
||||||
|
for i in a.split(" " + d + " "):
|
||||||
|
ls += parseArtists(i)
|
||||||
|
return ls
|
||||||
|
|
||||||
artistsnew = []
|
for d in delimiters_formal:
|
||||||
for a in artists:
|
if (d in a):
|
||||||
artistsnew.append(re.sub(r"(.*) \(ft. (.*)\)",r"\1",a))
|
ls = []
|
||||||
artistsnew.append(re.sub(r"(.*) \(ft. (.*)\)",r"\2",a))
|
for i in a.split(d):
|
||||||
|
ls += parseArtists(i)
|
||||||
artists = artistsnew
|
return ls
|
||||||
artistsnew = []
|
|
||||||
|
|
||||||
for a in artists:
|
|
||||||
artistsnew.append(a.split(" vs. "))
|
|
||||||
|
|
||||||
artists = flatten(artistsnew)
|
|
||||||
artistsnew = []
|
|
||||||
|
|
||||||
for a in artists:
|
|
||||||
artistsnew.append(a.split(" vs "))
|
|
||||||
|
|
||||||
artists = flatten(artistsnew)
|
|
||||||
artistsnew = []
|
|
||||||
|
|
||||||
for a in artists:
|
|
||||||
artistsnew.append(a.split(" & "))
|
|
||||||
|
|
||||||
artists = flatten(artistsnew)
|
|
||||||
artistsnew = []
|
|
||||||
|
|
||||||
|
|
||||||
for a in artists:
|
|
||||||
artistsnew.append(a.split(" ft. "))
|
|
||||||
|
|
||||||
artists = flatten(artistsnew)
|
return [a.strip()]
|
||||||
artistsnew = []
|
|
||||||
|
|
||||||
for a in artists:
|
def parseTitle(t):
|
||||||
artistsnew.append(a.split(" Ft. "))
|
t = t.replace("[","(").replace("]",")")
|
||||||
|
|
||||||
artists = flatten(artistsnew)
|
t = re.sub(r" \(as made famous by .*?\)","",t)
|
||||||
artistsnew = []
|
t = re.sub(r" \(originally by .*?\)","",t)
|
||||||
|
|
||||||
|
return t
|
||||||
|
|
||||||
for a in artists:
|
def parseTitleForArtists(t):
|
||||||
artistsnew.append(a.split(" Feat. "))
|
for d in delimiters_feat:
|
||||||
|
if re.match(r"(.*) \(" + d + " (.*?)\)",t) is not None:
|
||||||
|
(title,artists) = parseTitleForArtists(re.sub(r"(.*) \(" + d + " (.*?)\)",r"\1",t))
|
||||||
|
artists += parseArtists(re.sub(r"(.*) \(" + d + " (.*?)\).*",r"\2",t))
|
||||||
|
return (title,artists)
|
||||||
|
|
||||||
artists = flatten(artistsnew)
|
return (t,[])
|
||||||
artistsnew = []
|
|
||||||
|
|
||||||
for a in artists:
|
|
||||||
artistsnew.append(a.split(" feat. "))
|
|
||||||
|
|
||||||
artists = flatten(artistsnew)
|
|
||||||
artistsnew = []
|
|
||||||
|
|
||||||
|
|
||||||
for a in artists:
|
|
||||||
artistsnew.append(a.split(" featuring "))
|
|
||||||
|
|
||||||
artists = flatten(artistsnew)
|
|
||||||
artistsnew = []
|
|
||||||
|
|
||||||
|
|
||||||
for a in artists:
|
|
||||||
artistsnew.append(a.split(" Featuring "))
|
|
||||||
|
|
||||||
artists = flatten(artistsnew)
|
|
||||||
artistsnew = []
|
|
||||||
|
|
||||||
for a in artists:
|
|
||||||
artistsnew.append(a.split(" ; "))
|
|
||||||
|
|
||||||
artists = flatten(artistsnew)
|
|
||||||
artistsnew = []
|
|
||||||
|
|
||||||
for a in artists:
|
|
||||||
artistsnew.append(a.split("; "))
|
|
||||||
|
|
||||||
artists = flatten(artistsnew)
|
|
||||||
artistsnew = []
|
|
||||||
|
|
||||||
for a in artists:
|
|
||||||
artistsnew.append(a.split(";"))
|
|
||||||
|
|
||||||
artists = flatten(artistsnew)
|
|
||||||
artistsnew = []
|
|
||||||
|
|
||||||
#if not artists[0] == artiststr:
|
|
||||||
# print(artiststr + " became " + str(artists))
|
|
||||||
|
|
||||||
return artists
|
|
||||||
|
|
||||||
|
|
||||||
def cleantitle(title):
|
|
||||||
title = title.replace("[","(").replace("]",")")
|
|
||||||
|
|
||||||
title = re.sub(r" \(as made famous by .*?\)","",title)
|
|
||||||
title = re.sub(r" \(originally by .*?\)","",title)
|
|
||||||
|
|
||||||
return title
|
|
||||||
|
|
||||||
def findartistsintitle(title):
|
|
||||||
|
|
||||||
truetitle = title
|
|
||||||
artists = ""
|
|
||||||
|
|
||||||
newtitle = re.sub(r"(.*) \(ft. (.*?)\)",r"\1",title)
|
|
||||||
if (title != newtitle):
|
|
||||||
artists = re.sub(r"(.*) \(ft. (.*?)\).*",r"\2",title)
|
|
||||||
truetitle = newtitle
|
|
||||||
|
|
||||||
newtitle = re.sub(r"(.*) \(feat. (.*?)\)",r"\1",title)
|
|
||||||
if (title != newtitle):
|
|
||||||
artists = re.sub(r"(.*) \(feat. (.*?)\).*",r"\2",title)
|
|
||||||
truetitle = newtitle
|
|
||||||
|
|
||||||
newtitle = re.sub(r"(.*) \(Feat. (.*?)\)",r"\1",title)
|
|
||||||
if (title != newtitle):
|
|
||||||
artists = re.sub(r"(.*) \(Feat. (.*?)\).*",r"\2",title)
|
|
||||||
truetitle = newtitle
|
|
||||||
|
|
||||||
newtitle = re.sub(r"(.*) \(Ft. (.*?)\)",r"\1",title)
|
|
||||||
if (title != newtitle):
|
|
||||||
artists = re.sub(r"(.*) \(Ft. (.*?)\).*",r"\2",title)
|
|
||||||
truetitle = newtitle
|
|
||||||
|
|
||||||
newtitle = re.sub(r"(.*) \(Featuring (.*?)\)",r"\1",title)
|
|
||||||
if (title != newtitle):
|
|
||||||
artists = re.sub(r"(.*) \(Featuring (.*?)\).*",r"\2",title)
|
|
||||||
truetitle = newtitle
|
|
||||||
|
|
||||||
newtitle = re.sub(r"(.*) \(featuring (.*?)\)",r"\1",title)
|
|
||||||
if (title != newtitle):
|
|
||||||
artists = re.sub(r"(.*) \(featuring (.*?)\).*",r"\2",title)
|
|
||||||
truetitle = newtitle
|
|
||||||
|
|
||||||
|
|
||||||
artistlist = cleanup(artists)
|
|
||||||
|
|
||||||
return (truetitle,artistlist)
|
|
||||||
|
|
||||||
def flatten(lis):
|
def flatten(lis):
|
||||||
|
|
||||||
|
@ -118,8 +118,10 @@ def post_scrobble():
|
|||||||
#title = urllib.parse.unquote(keys.get("title"))
|
#title = urllib.parse.unquote(keys.get("title"))
|
||||||
artists = keys.get("artist")
|
artists = keys.get("artist")
|
||||||
title = keys.get("title")
|
title = keys.get("title")
|
||||||
|
time = int(keys.get("time"))
|
||||||
(artists,title) = cleanup.fullclean(artists,title)
|
(artists,title) = cleanup.fullclean(artists,title)
|
||||||
time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
|
if time is None:
|
||||||
|
time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
|
||||||
|
|
||||||
## this is necessary for localhost testing
|
## this is necessary for localhost testing
|
||||||
response.set_header("Access-Control-Allow-Origin","*")
|
response.set_header("Access-Control-Allow-Origin","*")
|
||||||
@ -134,7 +136,6 @@ def post_scrobble():
|
|||||||
@route("/sync")
|
@route("/sync")
|
||||||
def abouttoshutdown():
|
def abouttoshutdown():
|
||||||
sync()
|
sync()
|
||||||
print("Database saved to disk.")
|
|
||||||
#sys.exit()
|
#sys.exit()
|
||||||
|
|
||||||
# Starts the server
|
# Starts the server
|
||||||
@ -298,6 +299,7 @@ def sync():
|
|||||||
|
|
||||||
global lastsync
|
global lastsync
|
||||||
lastsync = time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
|
lastsync = time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
|
||||||
|
print("Database saved to disk.")
|
||||||
|
|
||||||
|
|
||||||
# Queries the database
|
# Queries the database
|
||||||
|
@ -13,10 +13,8 @@ for l in log:
|
|||||||
title = data[2]
|
title = data[2]
|
||||||
time = data[3]
|
time = data[3]
|
||||||
|
|
||||||
title = cleanup.cleantitle(title)
|
|
||||||
artists = cleanup.cleanup(artist)
|
(artists,title) = cleanup.fullclean(artist,title)
|
||||||
(title,extraartists) = cleanup.findartistsintitle(title)
|
|
||||||
artists = list(set(artists + extraartists))
|
|
||||||
|
|
||||||
artistsstr = "␟".join(artists)
|
artistsstr = "␟".join(artists)
|
||||||
|
|
||||||
|
4
rules/.gitignore
vendored
4
rules/.gitignore
vendored
@ -1,2 +1,2 @@
|
|||||||
*.csv
|
*.tsv
|
||||||
!examplerules.csv
|
!examplerules.tsv
|
||||||
|
@ -7,8 +7,8 @@
|
|||||||
### countas: defines an artist that should be counted together with another artist for chart statistics etc. This will not change the separation in the database and all effects of this rule will disappear as soon as it is no longer active. Second column is the artist, third column the replacement artist
|
### countas: defines an artist that should be counted together with another artist for chart statistics etc. This will not change the separation in the database and all effects of this rule will disappear as soon as it is no longer active. Second column is the artist, third column the replacement artist
|
||||||
###
|
###
|
||||||
### THE RULES IN THIS EXAMPLE FILE ARE IGNORED
|
### THE RULES IN THIS EXAMPLE FILE ARE IGNORED
|
||||||
notanartist,In Dreams,
|
notanartist In Dreams
|
||||||
belongtogether,Darth & Vader,
|
belongtogether Darth & Vader
|
||||||
replacetitle,첫 사랑니 (Rum Pum Pum Pum),Rum Pum Pum Pum
|
replacetitle 첫 사랑니 (Rum Pum Pum Pum) Rum Pum Pum Pum
|
||||||
replaceartist,Dal Shabet,Dal★Shabet
|
replaceartist Dal Shabet Dal★Shabet
|
||||||
countas,Trouble Maker,HyunA
|
countas Trouble Maker HyunA
|
Can't render this file because it contains an unexpected character in line 3 and column 58.
|
Loading…
x
Reference in New Issue
Block a user