编辑2:所以对于这个微笑('CC(C)(C)C1=C(C(=C(C=C1)O)O)O'),距离度量应该是这样的。
[1, 2.0, 3.0, 3.0, 3.0, 4.5, 6.0, 7.5, 6.0, 4.5, 8.5, 7.0, 5.5].
这个向量是按照微笑中字母的顺序排列的。
| SMILES | C-C | C-C-C-C-C | C-C-C@C@ | C-C@C@O- | C-C@C@ |O-C |
| ------------------------------- | -------------- | -------------- | -------------- | -------------- | -------------- |--------------|
| CC(C)(C)C1=C(C(=C(C=C1)O)O)O | 3 | 1 | 1 | 3 | 2 | 3 |
现在我想要的是这个。
| SMILES | C-C | C-C-C-C-C | C-C-C@C@ | C-C@C@O- | C-C@C@ |O-C |
| ------------------------------- | -------------- | -------------- | -------------- | -------------- | -------------- |--------------|
| CC(C)(C)C1=C(C(=C(C=C1)O)O)O | [1, 3, 3] | [2] | [3] | [4.5, 6, 7.5] | [6, 4] | [8.5, 7, 5.5] |
从上表中可以看到,向量中的列数代表组的频率,其值对应于组中每个出现的距离度量,我认为需要一种索引来将定义的每个组与距离度量相关联。
编辑1:我已经在下面发布了我的尝试。
我正在使用rdkit对群进行分类,并从一个字符串(代表一个分子)中得到一个距离度量,我试图创建一个向量,该向量将以官能团为首,其行数等于该官能团在一个微笑(代表分子的字符串)中出现的次数,其中的值是拓扑距离度量。
| 喀喀喀|克-克-|
| - ------|- ------|
| 1.5 |1个|
| 三个|五个|
| 四个||
我有2个代码,1个得到组,1个得到距离。我一直在努力将它们组合起来,以获得上面的输出。我已经尝试过了,但一直在努力。如果有人想看,我可以发送我尝试过的东西。这是我为每个作业制作的两个代码。
一个三个三个一个
有人能指导我如何做这件事,或者更好地告诉我的代码?非常感谢
#attempt 1
import pandas as pd
from rdkit import Chem
from queue import PriorityQueue
def collect_bonded_atoms(smile):
mol = Chem.MolFromSmiles(smile)
atom_counts = {}
distances = {}
for atom in mol.GetAtoms():
charge = '+' if atom.GetFormalCharge() > 0 else '-' if atom.GetFormalCharge() < 0 else ''
neighbors = [(neighbor.GetSymbol(), bond.GetBondType())
for neighbor, bond in zip(atom.GetNeighbors(), atom.GetBonds())]
neighbors.sort()
key = "{}{}-{}".format(charge, atom.GetSymbol(), "".join(f"{symbol}{'-' if bond_order == 1 else '=' if bond_order == 2 else '#' if bond_order == 3 else '@'}" for symbol, bond_order in neighbors))
atom_counts[key] = atom_counts.get(key, 0) + 1
distances[key] = {}
dijkstra_distances = dijkstra(mol, atom.GetIdx())
for neighbor in atom.GetNeighbors():
neighbor_key = "{}{}-{}".format('',
neighbor.GetSymbol(),
"".join(f"{bond.GetBondTypeAsDouble()}" for bond in mol.GetBonds() if (bond.GetBeginAtomIdx() == atom.GetIdx() and bond.GetEndAtomIdx() == neighbor.GetIdx()) or (bond.GetBeginAtomIdx() == neighbor.GetIdx() and bond.GetEndAtomIdx() == atom.GetIdx())))
neighbor_index = neighbor.GetIdx()
distances[key][neighbor_key] = dijkstra_distances[neighbor_index]
return atom_counts, distances
def dijkstra(mol, start_atom_index):
"""Computes the shortest path between a start atom and all other atoms in the molecule using Dijkstra's algorithm.
Args:
mol (RDKit Mol object): The input molecule.
start_atom_index (int): The index of the start atom.
Returns:
A list of distances from the start atom to each other atom in the molecule, where the i-th element is the distance from the start atom to the i-th atom in the molecule.
"""
# Initialize distances to infinity
distances = [float('inf')] * mol.GetNumAtoms()
distances[start_atom_index] = 0
# Initialize priority queue with start atom
pq = PriorityQueue()
pq.put((0, start_atom_index))
# Loop until all atoms have been visited
while not pq.empty():
# Get the next atom with the shortest distance
dist, atom_index = pq.get()
# Update the distances of its neighbors
atom = mol.GetAtomWithIdx(atom_index)
for neighbor in atom.GetNeighbors():
neighbor_index = neighbor.GetIdx()
bond = mol.GetBondBetweenAtoms(atom_index, neighbor_index)
bond_length = bond.GetBondTypeAsDouble()
new_distance = dist + bond_length
# Update the distance if it is shorter
if new_distance < distances[neighbor_index]:
distances[neighbor_index] = new_distance
pq.put((new_distance, neighbor_index))
return distances
smile_list = ['C1=CC=CC=C1', 'CC(C)C', 'CCO', 'C1=CC=C2C(=C1)C(=CN2)CC=C']
data = []
for smile in smile_list:
atom_counts, distances = collect_bonded_atoms(smile)
for key, value in atom_counts.items():
row = {}
# Add the SMILES to the row
row['SMILES'] = smile
# Add the presence of each group to the row
for group, count in atom_counts.items():
row[group] = 1 if group == key else 0
# Add the distance of each group to the row
for group, dist_dict in distances.items():
if group == key:
continue
dist = dist_dict.get(key, float('inf'))
row[group] = (1, dist) if dist != float('inf') else (0, 0)
data.append(row)
# Create a pandas DataFrame from the list of dictionaries
df = pd.DataFrame(data)
# Reorder the columns so that SMILES comes first
cols = list(df.columns)
cols.remove('SMILES')
cols = ['SMILES'] + cols
df = df[cols]
# Fill missing values with zeros
df = df.fillna(0)
# Display the DataFrame
print(df)
#attempt 2
from rdkit import Chem
from queue import PriorityQueue
def collect_bonded_atoms(smile):
mol = Chem.MolFromSmiles(smile)
Chem.Kekulize(mol)
atom_counts = {}
for atom in mol.GetAtoms():
symbol = atom.GetSymbol()
charge = '+' if atom.GetFormalCharge() > 0 else '-' if atom.GetFormalCharge() < 0 else ''
neighbors = [(neighbor.GetSymbol(), bond.GetBondType())
for neighbor, bond in zip(atom.GetNeighbors(), atom.GetBonds())]
neighbors.sort()
key = "{}{}-{}".format(charge, symbol, "".join(f"{symbol}{'-' if bond_order == 1 else '=' if bond_order == 2 else '#' if bond_order == 3 else '@'}" for symbol, bond_order in neighbors))
atom_counts[key] = atom_counts.get(key, 0) + 1
return atom_counts
def dijkstra(mol, start_atom_index):
"""Computes the shortest path between a start atom and all other atoms in the molecule using Dijkstra's algorithm.
Args:
mol (RDKit Mol object): The input molecule.
start_atom_index (int): The index of the start atom.
Returns:
A list of distances from the start atom to each other atom in the molecule, where the i-th element is the distance from the start atom to the i-th atom in the molecule.
"""
# Initialize distances to infinity
distances = [float('inf')] * mol.GetNumAtoms()
distances[start_atom_index] = 0
# Initialize priority queue with start atom
pq = PriorityQueue()
pq.put((0, start_atom_index))
# Loop until all atoms have been visited
while not pq.empty():
# Get the next atom with the shortest distance
dist, atom_index = pq.get()
# Update the distances of its neighbors
atom = mol.GetAtomWithIdx(atom_index)
for neighbor in atom.GetNeighbors():
neighbor_index = neighbor.GetIdx()
bond = mol.GetBondBetweenAtoms(atom_index, neighbor_index)
bond_length = bond.GetBondTypeAsDouble()
new_distance = dist + bond_length
# Update the distance if it is shorter
if new_distance < distances[neighbor_index]:
distances[neighbor_index] = new_distance
pq.put((new_distance, neighbor_index))
return distances
# Define a list of SMILES
smile_list = ['CC(C)(C)C1=C(C(=C(C=C1)O)O)O', 'C1CCCCC1']
# Create a list to hold the tuples for each SMILES
smile_tuples = []
# Loop through each SMILES and create a tuple for each group and its location
for smile in smile_list:
# Create a dictionary of group frequencies for this SMILES
atom_counts = collect_bonded_atoms(smile)
# Compute the topical distance matrix for this SMILES
mol = Chem.MolFromSmiles(smile)
distances = dijkstra(mol, 0)
# Create a list of tuples for this SMILES, where each tuple is (group, distance)
tuples = []
for group, frequency in atom_counts.items():
symbol = group.split('-', 1)[1].split('-')[0]
for i in range(frequency):
atom_index = Chem.MolFromSmiles(smile).GetSubstructMatches(Chem.MolFromSmiles(symbol))[i][0]
distance = distances[atom_index]
tuples.append((group, distance))
print(tuples)
print(distances)
print(group)
print(symbol)
#attempt 3
import pandas as pd
from rdkit import Chem
from queue import PriorityQueue
def dijkstra(mol, start_atom_index):
"""Computes the shortest path between a start atom and all other atoms in the molecule using Dijkstra's algorithm.
Args:
mol (RDKit Mol object): The input molecule.
start_atom_index (int): The index of the start atom.
Returns:
A list of distances from the start atom to each other atom in the molecule, where the i-th element is the distance from the start atom to the i-th atom in the molecule.
"""
# Initialize distances to infinity
distances = [float('inf')] * mol.GetNumAtoms()
distances[start_atom_index] = 0
# Initialize priority queue with start atom
pq = PriorityQueue()
pq.put((0, start_atom_index))
# Loop until all atoms have been visited
while not pq.empty():
# Get the next atom with the shortest distance
dist, atom_index = pq.get()
# Update the distances of its neighbors
atom = mol.GetAtomWithIdx(atom_index)
for neighbor in atom.GetNeighbors():
neighbor_index = neighbor.GetIdx()
bond = mol.GetBondBetweenAtoms(atom_index, neighbor_index)
bond_length = bond.GetBondTypeAsDouble()
new_distance = dist + bond_length
# Update the distance if it is shorter
if new_distance < distances[neighbor_index]:
distances[neighbor_index] = new_distance
pq.put((new_distance, neighbor_index))
return distances
def collect_bonded_atoms_and_distances(smile):
mol = Chem.MolFromSmiles(smile)
atom_counts = {}
distances = {}
for atom in mol.GetAtoms():
charge = '+' if atom.GetFormalCharge() > 0 else '-' if atom.GetFormalCharge() < 0 else ''
neighbors = [(neighbor.GetSymbol(), bond.GetBondType())
for neighbor, bond in zip(atom.GetNeighbors(), atom.GetBonds())]
neighbors.sort()
key = "{}{}-{}".format(charge, atom.GetSymbol(), "".join(f"{symbol}{'-' if bond_order == 1 else '=' if bond_order == 2 else '#' if bond_order == 3 else '@'}" for symbol, bond_order in neighbors))
if key not in atom_counts:
atom_counts[key] = 0
distances[key] = []
atom_counts[key] += 1
distances[key].extend(dijkstra(mol, atom.GetIdx()))
return {**atom_counts, **distances}
smile_list = ['CC(C)(C)C1=C(C(=C(C=C1)O)O)O']
# Create a list of dictionaries for each SMILES
data = []
for smile in smile_list:
counts_and_distances = collect_bonded_atoms_and_distances(smile)
data.append(counts_and_distances)
# Create a pandas DataFrame from the list of dictionaries
df = pd.DataFrame(data)
# Fill missing values with zeros
df = df.fillna(0)
# Transpose the DataFrame to match the desired output format
df = df.T
# Rename the columns and index
df.columns = [f"Atom {i}" for i in range(1, len(df.columns) + 1)]
df.index.name = 'Functional group'
# Display the final DataFrame
print(df)
#attempt 4
import pandas as pd
from rdkit import Chem
from queue import PriorityQueue
def dijkstra(mol, start_atom_index):
"""Computes the shortest path between a start atom and all other atoms in the molecule using Dijkstra's algorithm.
Args:
mol (RDKit Mol object): The input molecule.
start_atom_index (int): The index of the start atom.
Returns:
A list of distances from the start atom to each other atom in the molecule, where the i-th element is the distance from the start atom to the i-th atom in the molecule.
"""
# Initialize distances to infinity
distances = [float('inf')] * mol.GetNumAtoms()
distances[start_atom_index] = 0
# Initialize priority queue with start atom
pq = PriorityQueue()
pq.put((0, start_atom_index))
# Loop until all atoms have been visited
while not pq.empty():
# Get the next atom with the shortest distance
dist, atom_index = pq.get()
# Update the distances of its neighbors
atom = mol.GetAtomWithIdx(atom_index)
for neighbor in atom.GetNeighbors():
neighbor_index = neighbor.GetIdx()
bond = mol.GetBondBetweenAtoms(atom_index, neighbor_index)
bond_length = bond.GetBondTypeAsDouble()
new_distance = dist + bond_length
# Update the distance if it is shorter
if new_distance < distances[neighbor_index]:
distances[neighbor_index] = new_distance
pq.put((new_distance, neighbor_index))
return distances
def collect_bonded_atoms_and_distances(smile):
mol = Chem.MolFromSmiles(smile)
atom_counts = {}
for atom in mol.GetAtoms():
charge = '+' if atom.GetFormalCharge() > 0 else '-' if atom.GetFormalCharge() < 0 else ''
neighbors = [(neighbor.GetSymbol(), bond.GetBondType())
for neighbor, bond in zip(atom.GetNeighbors(), atom.GetBonds())]
neighbors.sort()
key = "{}{}-{}".format(charge, atom.GetSymbol(), "".join(f"{symbol}{'-' if bond_order == 1 else '=' if bond_order == 2 else '#' if bond_order == 3 else '@'}" for symbol, bond_order in neighbors))
atom_counts[key] = atom_counts.get(key, 0) + 1
distances = {}
for key in atom_counts:
distances[key] = []
indices = [atom.GetIdx() for atom in mol.GetAtoms() if atom_counts[key] > 0 and atom.GetSmarts() == key]
for index in indices:
dists = dijkstra(mol, index)
for i, d in enumerate(dists):
if i not in distances[key]:
distances[key].append(d)
atom_counts[key] -= 1
return {**atom_counts, **distances}
smile_list = ['CC(C)(C)C1=C(C(=C(C=C1)O)O)O']
# Create a list of dictionaries for each SMILES
data = []
for smile in smile_list:
counts_and_distances = collect_bonded_atoms_and_distances(smile)
data.append(counts_and_distances)
# Create a pandas DataFrame from the list of dictionaries
df = pd.DataFrame(data)
# Fill missing values with zeros
df = df.fillna(0)
# Add a column for the SMILES
df.insert(0, 'SMILES', smile_list)
# Display the DataFrame
print(df)
1条答案
按热度按时间fdbelqdn1#
像这样修改函数
collect_bonded_atoms
:根本不要更改函数
dijkstra
。使用如下函数:
老实说:我不知道这是否是你逻辑上想要实现的,但它的输出是你所要求的。
现在,每当
collect_bonded_atoms
找到一个“组”时,索引(我猜是该组的第一个原子)就会被获取并Map到distances-array。我还没有修改代码的功能,在
smile_list
中为多个SMILES创建一个巨大的 Dataframe ,但我想这不是你问题的主要焦点,是吗?让我知道你的情况如何。