Improved artist / title parsing

2025-08-30 23:00:07 -04:00 · 2018-11-28 15:33:30 +01:00 · 2018-11-28 15:33:30 +01:00 · 144198f933
commit 144198f933
parent ee5bd0998b
5 changed files with 58 additions and 154 deletions
--- a/cleanup.py
+++ b/cleanup.py
@@ -1,161 +1,65 @@
 import re
 def fullclean(artist,title):
-	artists = cleanup(removespecial(artist))
+	artists = parseArtists(removespecial(artist))
-	title = cleantitle(removespecial(title))
+	title = parseTitle(removespecial(title))
-	(title,moreartists) = findartistsintitle(title)
+	(title,moreartists) = parseTitleForArtists(title)
 	artists += moreartists
-	return (artists,title)
+	return (list(set(artists)),title)
 def removespecial(s):
 	return s.replace("\t","").replace("␟","").replace("\n","")
 def cleanup(artiststr):
-	if artiststr == "":
+delimiters_feat = ["ft.","ft","feat.","feat","featuring"]			#Delimiters used for extra artists, even when in the title field
 delimiters = ["vs.","vs","&"]							#Delimiters in informal titles, spaces expected around them
 delimiters_formal = ["; ",";"]							#Delimiters used specifically to tag multiple artists when only one tag field is available, no spaces used
 def parseArtists(a):
 	if a.strip() == "":
 		return []
 	for d in delimiters_feat:
 		if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None:
 			return parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a))
 	for d in (delimiters + delimiters_feat):
 		if ((" " + d + " ") in a):
 			ls = []
 			for i in a.split(" " + d + " "):
 				ls += parseArtists(i)
 			return ls
 	for d in delimiters_formal:
 		if (d in a):
 			ls = []
 			for i in a.split(d):
 				ls += parseArtists(i)
 			return ls
 	return [a.strip()]
-	artists = [artiststr]
+def parseTitle(t):
 	t = t.replace("[","(").replace("]",")")
-	artistsnew = []
+	t = re.sub(r" \(as made famous by .*?\)","",t)
-	for a in artists:
+	t = re.sub(r" \(originally by .*?\)","",t)
 		artistsnew.append(re.sub(r"(.*) \(ft. (.*)\)",r"\1",a))
 		artistsnew.append(re.sub(r"(.*) \(ft. (.*)\)",r"\2",a))
-	artists = artistsnew
+	return t
 	artistsnew = []
 	for a in artists:
 		artistsnew.append(a.split(" vs. "))
 	artists = flatten(artistsnew)
 	artistsnew = []
 	for a in artists:
 		artistsnew.append(a.split(" vs "))
 	artists = flatten(artistsnew)
 	artistsnew = []
 	for a in artists:
 		artistsnew.append(a.split(" & "))
 	artists = flatten(artistsnew)
 	artistsnew = []
 	for a in artists:
 		artistsnew.append(a.split(" ft. "))
 	artists = flatten(artistsnew)
 	artistsnew = []
 	for a in artists:
 		artistsnew.append(a.split(" Ft. "))
 	artists = flatten(artistsnew)
 	artistsnew = []
 	for a in artists:
 		artistsnew.append(a.split(" Feat. "))
 	artists = flatten(artistsnew)
 	artistsnew = []
 	for a in artists:
 		artistsnew.append(a.split(" feat. "))
 	artists = flatten(artistsnew)
 	artistsnew = []
 	for a in artists:
 		artistsnew.append(a.split(" featuring "))
 	artists = flatten(artistsnew)
 	artistsnew = []
 	for a in artists:
 		artistsnew.append(a.split(" Featuring "))
 	artists = flatten(artistsnew)
 	artistsnew = []
 	for a in artists:
 		artistsnew.append(a.split(" ; "))
 	artists = flatten(artistsnew)
 	artistsnew = []
 	for a in artists:
 		artistsnew.append(a.split("; "))
 	artists = flatten(artistsnew)
 	artistsnew = []
 	for a in artists:
 		artistsnew.append(a.split(";"))
 	artists = flatten(artistsnew)
 	artistsnew = []
 	#if not artists[0] == artiststr:
 	#	print(artiststr + " became " + str(artists))
 	return artists
 def cleantitle(title):
 	title = title.replace("[","(").replace("]",")")
 	title = re.sub(r" \(as made famous by .*?\)","",title)
 	title = re.sub(r" \(originally by .*?\)","",title)
 	return title
-def findartistsintitle(title):
+def parseTitleForArtists(t):
 	for d in delimiters_feat:
 		if re.match(r"(.*) \(" + d + " (.*?)\)",t) is not None:
 			(title,artists) = parseTitleForArtists(re.sub(r"(.*) \(" + d + " (.*?)\)",r"\1",t))
 			artists += parseArtists(re.sub(r"(.*) \(" + d + " (.*?)\).*",r"\2",t))
 			return (title,artists)
-	truetitle = title
+	return (t,[])
 	artists = ""
 	newtitle = re.sub(r"(.*) \(ft. (.*?)\)",r"\1",title)
 	if (title != newtitle):
 		artists = re.sub(r"(.*) \(ft. (.*?)\).*",r"\2",title)
 		truetitle = newtitle
 	newtitle = re.sub(r"(.*) \(feat. (.*?)\)",r"\1",title)
 	if (title != newtitle):
 		artists = re.sub(r"(.*) \(feat. (.*?)\).*",r"\2",title)
 		truetitle = newtitle
 	newtitle = re.sub(r"(.*) \(Feat. (.*?)\)",r"\1",title)
 	if (title != newtitle):
 		artists = re.sub(r"(.*) \(Feat. (.*?)\).*",r"\2",title)
 		truetitle = newtitle
 	newtitle = re.sub(r"(.*) \(Ft. (.*?)\)",r"\1",title)
 	if (title != newtitle):
 		artists = re.sub(r"(.*) \(Ft. (.*?)\).*",r"\2",title)
 		truetitle = newtitle
 	newtitle = re.sub(r"(.*) \(Featuring (.*?)\)",r"\1",title)
 	if (title != newtitle):
 		artists = re.sub(r"(.*) \(Featuring (.*?)\).*",r"\2",title)
 		truetitle = newtitle
 	newtitle = re.sub(r"(.*) \(featuring (.*?)\)",r"\1",title)
 	if (title != newtitle):
 		artists = re.sub(r"(.*) \(featuring (.*?)\).*",r"\2",title)
 		truetitle = newtitle
 	artistlist = cleanup(artists)
 	return (truetitle,artistlist)
 def flatten(lis):
--- a/database.py
+++ b/database.py
@@ -118,8 +118,10 @@ def post_scrobble():
 	#title = urllib.parse.unquote(keys.get("title"))
 	artists = keys.get("artist")
 	title = keys.get("title")
 	time = int(keys.get("time"))
 	(artists,title) = cleanup.fullclean(artists,title)
-	time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
+	if time is None:
 		time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
 	## this is necessary for localhost testing
 	response.set_header("Access-Control-Allow-Origin","*")
@@ -134,7 +136,6 @@ def post_scrobble():
@route("/sync")
 def abouttoshutdown():
 	sync()
 	print("Database saved to disk.")
 	#sys.exit()
 # Starts the server
@@ -298,6 +299,7 @@ def sync():
 	global lastsync
 	lastsync = time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
 	print("Database saved to disk.")
 # Queries the database			
--- a/lastfmconverter.py
+++ b/lastfmconverter.py
@@ -13,10 +13,8 @@ for l in log:
 	title = data[2]
 	time = data[3]
-	title = cleanup.cleantitle(title)
+
-	artists = cleanup.cleanup(artist)
+	(artists,title) = cleanup.fullclean(artist,title)
 	(title,extraartists) = cleanup.findartistsintitle(title)
 	artists = list(set(artists + extraartists))
 	artistsstr = "␟".join(artists)
--- a/rules/.gitignore
+++ b/rules/.gitignore
@@ -1,2 +1,2 @@
-*.csv
+*.tsv
-!examplerules.csv
+!examplerules.tsv
--- a/rules/examplerules.tsv
+++ b/rules/examplerules.tsv
@@ -7,8 +7,8 @@
 ###	countas: defines an artist that should be counted together with another artist for chart statistics etc. This will not change the separation in the database and all effects of this rule will disappear as soon as it is no longer active. Second column is the artist, third column the replacement artist
 ###
 ### THE RULES IN THIS EXAMPLE FILE ARE IGNORED
-notanartist,In Dreams,
+notanartist	In Dreams
-belongtogether,Darth & Vader,
+belongtogether	Darth & Vader
-replacetitle,첫 사랑니 (Rum Pum Pum Pum),Rum Pum Pum Pum
+replacetitle	첫 사랑니 (Rum Pum Pum Pum)	Rum Pum Pum Pum
-replaceartist,Dal Shabet,Dal★Shabet
+replaceartist	Dal Shabet			Dal★Shabet
-countas,Trouble Maker,HyunA
+countas		Trouble Maker			HyunA
`@@ -1,2 +1,2 @@`
	`*.csv`	`*.tsv`
	`!examplerules.csv`	`!examplerules.tsv`